#################################################################
## Replication codde for Strange, Enos, Hill, and Lakeman (2019)
## "Online Volunteer Laboratories for Human Subjects Research"
## 
## When run, the following code produces Figures 1 and 2 from the main text, Table 1 from the main text,
## Figures A and B from S1 Appendix, Tables A, B, and C from S2 Appendix, and Tables A, B, C, and D
## from the S3 Appendix, Tables A and B from the S4 Appendix, and Figures A, B, and D from the 
## S4 Appendix
##
#################################################################

# load required libraries and source code for replication
library(foreign)
library(plyr)
library(dplyr)
library(dotwhisker)
library(sjstats)
library(ggsn)
library(gridExtra)
library(sp)
library(maps)
library(maptools)
library(dummies)
library(tables)
library(xtable)
library(Hmisc)
library(prettyR)
library(weights)
#devtools::install_github("dkahle/ggmap")
library(ggmap)
library(ggplot2)
library(grid)
library(stargazer)
library(MCMCpack)
library(Rmisc)
library(stats)
library(grid)
library(cowplot)

# set working directory to folder with datasets folder and source_files folder on your computer
#setwd("...")

# source file that recodes raw Qualtrics values to meaningful values
source("source_files/recode_demographics.R")
# source file with functions that help merge and code the data
source("source_files/table_replication_functions.R")

# prevent scientific notation
options(scipen=999)

##########################
## Figure 1, Main paper ##
##########################

# read cleaned motivation data for Figure 1
dat <- read.csv("datasets/Figure_1_cleaned_data.csv")


# the following process creates a new data.frame to be used for plotting the means of the
# dlabss and mturk groups for each question regarding participation motivation

# creates a row with the question categories, one for each group
Category <- c(rep("Help Researchers",2),rep("Volunteer Time",2),rep("Experience Studies",2),
              rep("Current Affairs",2),rep("Pass Time",2),
              rep("Community",2), rep("Earn Money",2))

# creates a row to indicate dlabss or mturk group
Group <- rep(c(1,0),7)

# finds the dlabss or mturk mean for each question 
Means <- c(mean(dat$help.researchers[dat$volunteer==1], na.rm = T), mean(dat$help.researchers[dat$volunteer==0], na.rm = T),
           mean(dat$volunteer.time[dat$volunteer==1], na.rm = T), mean(dat$volunteer.time[dat$volunteer==0], na.rm = T),
           mean(dat$experience.studies[dat$volunteer==1], na.rm = T), mean(dat$experience.studies[dat$volunteer==0], na.rm = T),
           mean(dat$current.affairs[dat$volunteer==1], na.rm = T), mean(dat$current.affairs[dat$volunteer==0], na.rm = T),
           mean(dat$pass.time[dat$volunteer==1], na.rm = T), mean(dat$pass.time[dat$volunteer==0], na.rm = T),
           mean(dat$community[dat$volunteer==1], na.rm = T), mean(dat$community[dat$volunteer==0], na.rm = T),
           mean(dat$earn.money[dat$volunteer==1], na.rm = T), mean(dat$earn.money[dat$volunteer==0], na.rm = T)
)

# finds the standard error for each mean calculated above using a function from the 
# recode_demographics source file
SEs <- c(SE(dat$help.researchers[dat$volunteer==1]), SE(dat$help.researchers[dat$volunteer==0]),
         SE(dat$volunteer.time[dat$volunteer==1]), SE(dat$volunteer.time[dat$volunteer==0]),
         SE(dat$experience.studies[dat$volunteer==1]), SE(dat$experience.studies[dat$volunteer==0]),
         SE(dat$current.affairs[dat$volunteer==1]), SE(dat$current.affairs[dat$volunteer==0]),
         SE(dat$pass.time[dat$volunteer==1]), SE(dat$pass.time[dat$volunteer==0]),
         SE(dat$community[dat$volunteer==1]), SE(dat$community[dat$volunteer==0]),
         SE(dat$earn.money[dat$volunteer==1]), SE(dat$earn.money[dat$volunteer==0])
)

# combines question category, group, means, and standard deviations from above into a data.frame
group.dat <- data.frame(Category, Group, Means, SEs)

# make group and question category factors for plotting
group.dat$Group <- factor(group.dat$Group, levels = c(0,1))
group.dat$Category <- factor(group.dat$Category, levels = rev(unique(group.dat$Category)))

# the following code outputs a .png file of Figure 1 to the current working directory
png(filename = "Fig1.png", width = 1250, height = 700)
ggplot(group.dat,aes(x=Category,y=Means,fill=Group))+
  geom_bar(stat="identity",position="dodge")+
  scale_fill_manual(name="",
                    breaks=c(1, 0),
                    labels=c("DLABSS", "MTurk"),
                    values = c(alpha("grey65", 0.65),alpha("grey10", 0.65)))+
  xlab("Motivation")+ylab("Mean Rating")+
  theme_bw()+
  theme(axis.text = element_text(size = 20))+
  theme(axis.title = element_text(size = 20))+
  theme(legend.text = element_text(size = 20))+
  theme(legend.title = element_text(size = 20))+
  theme(axis.text.y = element_text(angle = 33, hjust = 1))+
  geom_errorbar(aes(ymin=Means-SEs, ymax=Means+SEs),
                width=.2,
                position=position_dodge(.9))+
  coord_flip()
dev.off()

# remove datasets required to make Figure 1
rm(dat); rm(group.dat)

##################
## End Figure 1 ##
##################

#########################
## Table 1, Main paper ##
#########################

## Each column is constructed incrementally to be put together as a latex table at the end using xtable

# read in required datasets
dlabss_anesp <- read.csv("datasets/dlabss_anesp_replication.csv")
dlabss_master <- read.csv("datasets/dlabss_master.csv")
dlabss_states <- read.csv("datasets/dlabss_region_data.csv")
mturk_anesp <- read.csv("datasets/mturk_anesp_replication.csv")
mturk_states <- read.csv("datasets/mturk_region_data.csv")

## Load CPS Basic Monthly November 2012 data
cps <- read.csv("datasets/cps_nov_12.csv")

## load in ANES 2012 Time Series data, rename, and remove old version
load(file = "datasets/anes_2012.rda")
anes12 <- da35157.0001 ; rm(da35157.0001)

## Column 1: DLABSS demographic data ##

dlabss_means <- rep(NA, 24)
dlabss_ses <- rep(NA, 24)

# DLABSS female  mean and sd x100 to be put on percent scale
dlabss_means[1] <- round(mean(dlabss_master$Gender==2, na.rm = T)*100,1)
dlabss_ses[1] <- round(SE.prop(dlabss_master$Gender==2)*100,1)

length(na.omit(dlabss_master$Gender))

# reordering variables so they are in the desired order
table_order_1 <- c("Education", "age.years", "income")

# holder variable for initial ordering
temp <- dlabss_master[,table_order_1]

# holder for means to be appended to other
mean.holder <- apply(temp, 2, mean, na.rm=TRUE)

# holder for standard error to be appended to other
se.holder <- apply(temp, 2, SE)

# combine means up to this point
dlabss_means[2:3] <-  round(mean.holder[1:2],1)
dlabss_means[4] <- round(mean.holder[3],0)

# combine SEs up to this point
dlabss_ses[2:3] <- round(se.holder[1:2],1)
dlabss_ses[4] <- round(se.holder[3],0)

# dlabss median income
dlabss_means[5] <- median(dlabss_master$income,na.rm = T)

dlabss_ses[5] <- ""

## all race x100 to be put on percent scale
## DLABSS race white
dlabss_means[6] <- round(mean(dlabss_master$Race=="white",na.rm=T)*100,1)
dlabss_ses[6] <- round(SE.prop(dlabss_master$Race=="white")*100,1)

## DLABSS race black
dlabss_means[7] <- round(mean(dlabss_master$Race=="black",na.rm=T)*100,1)
dlabss_ses[7] <- round(SE.prop(dlabss_master$Race=="black")*100,1)

## DLABSS race hispanic
dlabss_means[8] <- round(mean(dlabss_master$Race=="hispanic",na.rm=T)*100,1)
dlabss_ses[8] <- round(SE.prop(dlabss_master$Race=="hispanic")*100,1)

# DLABSS marital status x100 to be put on percent scale
# use dlabss_anesp here because marital status data is from anesp replication
dlabss_means[9] <- round(mean(dlabss_anesp$marital=="married",na.rm = T)*100,1)
dlabss_ses[9] <- round(SE.prop(dlabss_anesp$marital=="married")*100,1)

# DLABSS home ownership x100 to be put on percent scale
# use dlabss_anesp here because housing data is from anesp replication
dlabss_means[10] <- round(mean(dlabss_anesp$housing=="own",na.rm = T)*100,1)
dlabss_ses[10] <- round(SE.prop(dlabss_anesp$housing=="own")*100,1)

## DLABSS religion all x100 to be put on percent scale
# use dlabss_anesp here because this data is from anesp replication

# no religion
dlabss_means[11] <- round(mean(dlabss_anesp$religion=="none",na.rm = T)*100,1)
dlabss_ses[11] <- round(SE.prop(dlabss_anesp$religion=="none")*100,1)

# Protestant
dlabss_means[12] <- round(mean(dlabss_anesp$religion=="protestant",na.rm = T)*100,1)
dlabss_ses[12] <- round(SE.prop(dlabss_anesp$religion=="protestant")*100,1)

# Catholic
dlabss_means[13] <- round(mean(dlabss_anesp$religion=="catholic",na.rm = T)*100,1)
dlabss_ses[13] <- round(SE.prop(dlabss_anesp$religion=="catholic")*100,1)

# DLABSS region data comes from dlabss_states dataset
# all x100 to be put on percent scale

## northeast
dlabss_means[14] <- round(mean(dlabss_states$regionnortheast)*100,1)
dlabss_ses[14] <- round(SE.prop(dlabss_states$regionnortheast)*100,1)

## midwest
dlabss_means[15] <- round(mean(dlabss_states$regionmidwest)*100,1)
dlabss_ses[15] <- round(SE.prop(dlabss_states$regionmidwest)*100,1)

## south
dlabss_means[16] <- round(mean(dlabss_states$regionsouth)*100,1)
dlabss_ses[16] <- round(SE.prop(dlabss_states$regionsouth)*100,1)

## west
dlabss_means[17] <- round(mean(dlabss_states$regionwest)*100,1)
dlabss_ses[17] <- round(SE.prop(dlabss_states$regionwest)*100,1)

# DLABSS self-reported party identification from master file
## create a party ID variable from numeric codes
dlabss_master$partyID <- NA
dlabss_master$partyID[dlabss_master$Party == 1] <- "Democrat"
dlabss_master$partyID[dlabss_master$Party == 2] <- "Republican"
dlabss_master$partyID[dlabss_master$Party == 3] <- "Independent"

# all x100 to be put on percent scale
# democrat
dlabss_means[18] <- round(mean(dlabss_master$partyID=='Democrat',na.rm = T)*100,1)
dlabss_ses[18] <- round(SE.prop(dlabss_master$partyID=='Democrat')*100,1)

# independent/other
dlabss_means[19] <- round(mean(dlabss_master$partyID=='Independent',na.rm = T)*100,1)
dlabss_ses[19] <- round(SE.prop(dlabss_master$partyID=='Independent')*100,1)

# republican
dlabss_means[20] <- round(mean(dlabss_master$partyID=='Republican',na.rm = T)*100,1)
dlabss_ses[20] <- round(SE.prop(dlabss_master$partyID=='Republican')*100,1)

# DLABSS self reported ideology from master file
# create ideology variable from numeric code

# in this case the blanks in the ideology column are being read in as NA
# but this not what we want because we want the proportion of people
# who identify as liberal or conservative relative to the whole panel,
# not just those who identified as one or the other. Therefore, NAs in this
# column are reverted back to their original "blank" value
dlabss_master$Ideology[is.na(dlabss_master$Ideology)] <- ""


dlabss_master$Ideology[dlabss_master$Ideology=='1'] <- 'Liberal'
dlabss_master$Ideology[dlabss_master$Ideology=='2'] <- 'Conservative'
dlabss_master$Ideology[dlabss_master$Ideology==''] <- 'Other'

# all x100 to be put on percent scale
# liberal
dlabss_means[21] <- round(mean(dlabss_master$Ideology=='Liberal',na.rm = T)*100,1)
dlabss_ses[21] <- round(SE.prop(dlabss_master$Ideology=='Liberal')*100,1)

# conservative
dlabss_means[22] <- round(mean(dlabss_master$Ideology=='Conservative',na.rm = T)*100,1)
dlabss_ses[22] <- round(SE.prop(dlabss_master$Ideology=='Conservative')*100,1)

## DLABSS voter registration from anesp replication data
## change "I don't know" (99) to did not vote consistent with Berinsky et al. (2012)
dlabss_anesp$registered[dlabss_anesp$registered == 99] <- 0

# all x100 to be put on percent scale
# voter registration
dlabss_means[23] <- round(mean(dlabss_anesp$registered, na.rm = T)*100,1)
dlabss_ses[23] <- round(SE.prop(dlabss_anesp$registered)*100,1)

## change "I don't know" (99) to did not vote consistent with Berinsky et al. (2012)
dlabss_anesp$turnout[dlabss_anesp$turnout == 99] <- 0

# voter turnout 2008
dlabss_means[24] <- round(mean(dlabss_anesp$turnout, na.rm = T)*100,1)
dlabss_ses[24] <- round(SE.prop(dlabss_anesp$turnout)*100,1)

# put means and SEs together nicely for table column
dl_msd <- paste(dlabss_means," (",dlabss_ses,")",sep="")
# add N row
dl_msd[25] <- "909-8,122"

## Column 2: mTurk demographic data from mTurk survey we conducted ##

mturk_means <- rep(NA, 24)
mturk_ses <- rep(NA, 24)

# mturk female x100 for percent scale
mturk_means[1] <- round(mean(mturk_anesp$female, na.rm = T)*100,1)
mturk_ses[1] <- round(SE.prop(mturk_anesp$female)*100,1)

# mturk Education years
mturk_means[2] <- round(mean(mturk_anesp$Education, na.rm = T),1)
mturk_ses[2] <- round(SE(mturk_anesp$Education),1)

# mturk age years
mturk_means[3] <- round(mean(mturk_anesp$age.years, na.rm = T),1)
mturk_ses[3] <- round(SE(mturk_anesp$age.years),1)

# mturk mean and median income
mturk_means[4] <- round(mean(mturk_anesp$income,na.rm=T),0)
mturk_ses[4] <- round(SE(mturk_anesp$income),0)

mturk_means[5] <- median(mturk_anesp$income,na.rm = T)
mturk_ses[5] <- ""

## all race x100 to be put on percent scale
## mturk race white
mturk_means[6] <- round(mean(mturk_anesp$Race=="white",na.rm=T)*100,1)
mturk_ses[6] <- round(SE.prop(mturk_anesp$Race=="white")*100,1)

## mturk race black
mturk_means[7] <- round(mean(mturk_anesp$Race=="black",na.rm=T)*100,1)
mturk_ses[7] <- round(SE.prop(mturk_anesp$Race=="black")*100,1)

## mturk race hispanic
mturk_means[8] <- round(mean(mturk_anesp$Race=="hispanic",na.rm=T)*100,1)
mturk_ses[8] <- round(SE.prop(mturk_anesp$Race=="hispanic")*100,1)

# mturk marital status x100 to be put on percent scale
mturk_means[9] <- round(mean(mturk_anesp$marital=="married",na.rm = T)*100,1)
mturk_ses[9] <- round(SE.prop(mturk_anesp$marital=="married")*100,1)

# mturk home ownership x100 to be put on percent scale
mturk_means[10] <- round(mean(mturk_anesp$housing=="own",na.rm = T)*100,1)
mturk_ses[10] <- round(SE.prop(mturk_anesp$housing=="own")*100,1)

## mturk religion all x100 to be put on percent scale

# no religion
mturk_means[11] <- round(mean(mturk_anesp$religion=="none",na.rm = T)*100,1)
mturk_ses[11] <- round(SE.prop(mturk_anesp$religion=="none")*100,1)

# Protestant
mturk_means[12] <- round(mean(mturk_anesp$religion=="protestant",na.rm = T)*100,1)
mturk_ses[12] <- round(SE.prop(mturk_anesp$religion=="protestant")*100,1)

# Catholic
mturk_means[13] <- round(mean(mturk_anesp$religion=="catholic",na.rm = T)*100,1)
mturk_ses[13] <- round(SE.prop(mturk_anesp$religion=="catholic")*100,1)

# mturk region data comes from mturk_states dataset
# all x100 to be put on percent scale

## northeast
mturk_means[14] <- round(mean(mturk_states$regionnortheast)*100,1)
mturk_ses[14] <- round(SE.prop(mturk_states$regionnortheast)*100,1)

## midwest
mturk_means[15] <- round(mean(mturk_states$regionmidwest)*100,1)
mturk_ses[15] <- round(SE.prop(mturk_states$regionmidwest)*100,1)

## south
mturk_means[16] <- round(mean(mturk_states$regionsouth)*100,1)
mturk_ses[16] <- round(SE.prop(mturk_states$regionsouth)*100,1)

## west
mturk_means[17] <- round(mean(mturk_states$regionwest)*100,1)
mturk_ses[17] <- round(SE.prop(mturk_states$regionwest)*100,1)

## mturk self-reported party identification

# democrat
mturk_means[18] <- round(mean(mturk_anesp$par.1=="democrat",na.rm=T)*100,1)
mturk_ses[18] <- round(SE.prop(mturk_anesp$par.1=="democrat")*100,1)

# independent
mturk_means[19] <- round(mean(mturk_anesp$par.1=="independent",na.rm=T)*100,1)
mturk_ses[19] <- round(SE.prop(mturk_anesp$par.1=="independent")*100,1)

# republican
mturk_means[20] <- round(mean(mturk_anesp$par.1=="republican",na.rm=T)*100,1)
mturk_ses[20] <- round(SE.prop(mturk_anesp$par.1=="republican")*100,1)

## mturk self reported ideology

# first create empty vector to store ideologies derived from scale question
# if respondents did not say they were liberal or conservative, they were asked
# which way they leaned which is scale.4 in this case

mturk_anesp$Ideology <- NA
mturk_anesp$Ideology[mturk_anesp$scale.1 == '1'] <- 'Liberal'
mturk_anesp$Ideology[mturk_anesp$scale.1 == '3' & 
                       mturk_anesp$scale.4=='1'] <- 'Liberal'
mturk_anesp$Ideology[mturk_anesp$scale.1 == '2'] <- 'Conservative'
mturk_anesp$Ideology[mturk_anesp$scale.1 == '3' & 
                       mturk_anesp$scale.4=='2'] <- 'Conservative'

# mturk liberal
mturk_means[21] <- round(mean(mturk_anesp$Ideology=='Liberal', na.rm = T)*100,1)
mturk_ses[21] <- round(SE.prop(mturk_anesp$Ideology=='Liberal')*100,1)

# mtuek conservative
mturk_means[22] <- round(mean(mturk_anesp$Ideology=='Conservative', na.rm = T)*100,1)
mturk_ses[22] <- round(SE.prop(mturk_anesp$Ideology=='Conservative')*100,1)

## mturk voter registration from anesp replication data
## change "I don't know" (99) to did not vote consistent with Berinsky et al. (2012)
mturk_anesp$registered[mturk_anesp$registered == 99] <- 0

# all x100 to be put on percent scale
# voter registration
mturk_means[23] <- round(mean(mturk_anesp$registered, na.rm = T)*100,1)
mturk_ses[23] <- round(SE.prop(mturk_anesp$registered)*100,1)

## change "I don't know" (99) to did not vote consistent with Berinsky et al. (2012)
mturk_anesp$turnout[mturk_anesp$turnout == 99] <- 0

# voter turnout 2008
mturk_means[24] <- round(mean(mturk_anesp$turnout, na.rm = T)*100,1)
mturk_ses[24] <- round(SE.prop(mturk_anesp$turnout)*100,1)

# put means and SEs together nicely for table column
mt_msd <- paste(mturk_means," (",mturk_ses,")",sep="")
# add N row
mt_msd[25] <- "673-705"


## Column 3: ANESP demographic data from Berinsky et al (2012) ##

## NOTE: As the ANESP results were taken directly from Berinsky et al (2012) and not a survey that
## we personally conducted, they are hard coded into the table

anesp_means <- rep(NA,25)
anesp_ses <- rep(NA,25)

anesp_means[1] <- 57.6
anesp_ses[1] <- 0.9

anesp_means[2] <- 16.2
anesp_ses[2] <- 0.1

anesp_means[3] <- 49.7
anesp_ses[3] <- 0.3

anesp_means[4] <- 69043
anesp_ses[4] <- 749

anesp_means[5] <- 67500
anesp_ses[5] <- ""

anesp_means[6] <- 83.0
anesp_ses[6] <- 0.7

anesp_means[7] <- 8.9
anesp_ses[7] <- 0.7

anesp_means[8] <- 5.0 
anesp_ses[8] <- 0.4

anesp_means[9] <- 56.8
anesp_ses[9] <- 0.9

anesp_means[10] <- 80.8
anesp_ses[10] <- 0.8

anesp_means[11] <- 13.1
anesp_ses[11] <- 0.8

anesp_means[12] <- 38.7
anesp_ses[12] <- 1.4

anesp_means[13] <- 22.9
anesp_ses[13] <- 1.0

anesp_means[14] <- 16.9
anesp_ses[14] <- 0.7

anesp_means[15] <- 28.3
anesp_ses[15] <- 0.9

anesp_means[16] <- 31.4
anesp_ses[16] <- 0.9

anesp_means[17] <- 23.4
anesp_ses[17] <- 0.8

anesp_means[23] <- 92.0
anesp_ses[23] <- 0.7

anesp_means[24] <- 89.8
anesp_ses[24] <- 0.5

# put means and SEs together nicely for table column
anesp_msd <- paste(anesp_means," (",anesp_ses,")",sep="")
# add N row
anesp_msd[25] <- "2,727-3,003"

# make sure blank rows are blank instead of NA for table
anesp_msd[18:22] <- ""


## Column 4: Current Population Survey Basic Monthly Nov. 2012 demographic data ##

## subset data to 18 years or older and recode to use values

# age years to numeric
cps$age.years <- as.numeric(cps$AGE)

# subset to >= 18
cps <- cps[cps$age.years >= 18,]

## female indicator
cps$female <- 0
cps$female[cps$SEX == 2] <- 1

# years of education; based on highest degree achieved, average years of education
# required for that degree imputed
cps$edu.years <- 0
cps$edu.years[cps$EDUC == 1] <- NA
cps$edu.years[cps$EDUC == 10] <- 4
cps$edu.years[cps$EDUC == 20] <- 6
cps$edu.years[cps$EDUC == 30] <- 8
cps$edu.years[cps$EDUC == 40] <- 9
cps$edu.years[cps$EDUC == 50] <- 10
cps$edu.years[cps$EDUC == 60] <- 11
cps$edu.years[cps$EDUC == 71] <- 11.5
cps$edu.years[cps$EDUC == 73] <- 12
cps$edu.years[cps$EDUC == 81] <- 13
cps$edu.years[cps$EDUC == 91] <- 14
cps$edu.years[cps$EDUC == 92] <- 14
cps$edu.years[cps$EDUC == 111] <- 16
cps$edu.years[cps$EDUC == 123] <- 18
cps$edu.years[cps$EDUC == 124] <- 20
cps$edu.years[cps$EDUC == 125] <- 20

## householder family income as median of provided range of income, i.e. 5000 to 10000 = 7500 in this case
cps$income <- NA
cps$income[cps$FAMINC == 100] <- 2500
cps$income[cps$FAMINC == 210] <- 6250
cps$income[cps$FAMINC == 300] <- 8750
cps$income[cps$FAMINC == 430] <- 11250
cps$income[cps$FAMINC == 470] <- 13750
cps$income[cps$FAMINC == 500] <- 17500
cps$income[cps$FAMINC == 600] <- 22500
cps$income[cps$FAMINC == 710] <- 27500
cps$income[cps$FAMINC == 720] <- 32500
cps$income[cps$FAMINC == 730] <- 37500
cps$income[cps$FAMINC == 740] <- 45000
cps$income[cps$FAMINC == 820] <- 55000
cps$income[cps$FAMINC == 830] <- 67500
cps$income[cps$FAMINC == 841] <- 87500
cps$income[cps$FAMINC == 842] <- 125000
cps$income[cps$FAMINC == 843] <- 150000

## race

# indicator for white
cps$white <- 0
cps$white[cps$RACE == 100] <- 1

# indicator for black
cps$black <- 0
cps$black[cps$RACE == 200] <- 1

# indicator for hispanic
cps$hispanic <- 0
cps$hispanic[cps$HISPAN == 100 | cps$HISPAN == 410 | cps$HISPAN == 400 |
               cps$HISPAN == 200 | cps$HISPAN == 300] <- 1


## marital status recode
cps$marital <- NA
cps$marital[cps$MARST == 1 | cps$MARST == 2] <- "married"
cps$marital[cps$MARST == 3] <- "separated"
cps$marital[cps$MARST == 4] <- "divorced"
cps$marital[cps$MARST == 5] <- "widowed"
cps$marital[cps$MARST == 6] <- "never"

# indicator for married
cps$married <- 0
cps$married[cps$marital == "married"] <- 1

## region recode
cps$reg <- NA
cps$reg[cps$REGION == 11 | cps$REGION == 12] <- "northeast"
cps$reg[cps$REGION == 21 | cps$REGION == 22] <- "midwest"
cps$reg[cps$REGION == 31 | cps$REGION == 32 | cps$REGION == 33] <- "south"
cps$reg[cps$REGION == 41 | cps$REGION == 42] <- "west"

# NE indicator
cps$northeast <- 0
cps$northeast[cps$reg == "northeast"] <- 1

# MW indicator
cps$midwest <- 0
cps$midwest[cps$reg == "midwest"] <- 1

# S indicator
cps$south <- 0
cps$south[cps$reg == "south"] <- 1

# W indicator
cps$west <- 0
cps$west[cps$reg == "west"] <- 1

## voter registration / turnout
## NOTE: Missing values are coded as NA
## Don't know/refused/no response coded as No to be consistent with Berinkey et al. (2012)
cps$registered <- 0
cps$registered[cps$VOTEREG == 99] <- NA
cps$registered[cps$VOTEREG == 1 | cps$VOTEREG == 2] <- 1

## NOTE: Missing values are coded as NA
## Don't know/refused/no response coded as No to be consistent with Berinkey et al. (2012)
cps$voted <- 0
cps$voted[cps$VOTEREG == 99] <- NA
cps$voted[cps$VOTEREG == 2] <- 1

## NOTE: When calculating means and standard deviations for the CPS the person-level weights included 
## with the data are used 

cps_means <- rep(NA,25)
cps_ses <- rep(NA,25)

## CPS female
cps_means[1] <- round(wtd.mean(cps$female, cps$WTFINL)*100,1)
cps_ses[1] <- round(sqrt(wtd.var(cps$female, cps$WTFINL))/sqrt(length(cps$female))*100,1)

## CPS years education
cps_means[2] <- round(wtd.mean(cps$edu.years, cps$WTFINL),1)
cps_ses[2] <- round(sqrt(wtd.var(cps$edu.years, cps$WTFINL))/sqrt(length(cps$edu.years)),1)

## CPS age 
cps_means[3] <- round(wtd.mean(cps$age.years, cps$WTFINL),1)
cps_ses[3] <- round(sqrt(wtd.var(cps$age.years, cps$WTFINL))/sqrt(length(cps$age.years)),1)

## CPS income 
cps_means[4] <- round(wtd.mean(cps$income, cps$HWTFINL),0)
cps_ses[4] <- round(sqrt(wtd.var(cps$income, cps$HWTFINL))/sqrt(length(cps$income)),0)

cps_means[5] <- median(cps$income)
cps_ses[5] <- ""

## CPS race 
## white
cps_means[6] <- round(wtd.mean(cps$white, cps$WTFINL)*100,1)
cps_ses[6] <- round(sqrt(wtd.var(cps$white, cps$WTFINL))/sqrt(length(cps$white))*100,1)

## black
cps_means[7] <- round(wtd.mean(cps$black, cps$WTFINL)*100,1)
cps_ses[7] <- round(sqrt(wtd.var(cps$black, cps$WTFINL))/sqrt(length(cps$black))*100,1)

## hispanic
cps_means[8] <- round(wtd.mean(cps$hispanic, cps$WTFINL)*100,1)
cps_ses[8] <- round(sqrt(wtd.var(cps$hispanic, cps$WTFINL))/sqrt(length(cps$hispanic))*100,1)

## CPS marital status
## married
cps_means[9] <- round(wtd.mean(cps$married, cps$WTFINL)*100,1)
cps_ses[9] <- round(sqrt(wtd.var(cps$married, cps$WTFINL))/sqrt(length(cps$married))*100,1)

## CPS region
## northeast
cps_means[14] <- round(wtd.mean(cps$northeast, cps$HWTFINL)*100,1)
cps_ses[14] <- round(sqrt(wtd.var(cps$northeast, cps$HWTFINL))/sqrt(length(cps$northeast))*100,1)

## midwest
cps_means[15] <- round(wtd.mean(cps$midwest, cps$HWTFINL)*100,1)
cps_ses[15] <- round(sqrt(wtd.var(cps$midwest, cps$HWTFINL))/sqrt(length(cps$midwest))*100,1)

## south
cps_means[16] <- round(wtd.mean(cps$south, cps$HWTFINL)*100,1)
cps_ses[16] <- round(sqrt(wtd.var(cps$south, cps$HWTFINL))/sqrt(length(cps$south))*100,1)

## west
cps_means[17] <- round(wtd.mean(cps$west, cps$HWTFINL)*100,1)
cps_ses[17] <- round(sqrt(wtd.var(cps$west, cps$HWTFINL))/sqrt(length(cps$west))*100,1)

## CPS voter turnout and registration
## registration
cps_means[23] <- round(wtd.mean(cps$registered, cps$WTFINL)*100,1)
cps_ses[23] <- round(sqrt(wtd.var(cps$registered, cps$WTFINL))/sqrt(length(cps$registered))*100,1)

## turnout 
cps_means[24] <- round(wtd.mean(cps$voted, cps$WTFINL)*100,1)
cps_ses[24] <- round(sqrt(wtd.var(cps$voted, cps$WTFINL))/sqrt(length(cps$voted))*100,1)

# put means and SEs together nicely for table column
cps_msd <- paste(cps_means," (",cps_ses,")",sep="")

# include asterisk for voted in 2012
cps_msd[24] <- paste(cps_msd[24],"*",sep="")

# add N row
cps_msd[25] <- "92,311-102,011"

# make sure blank rows are blank instead of NA for table
cps_msd[c(10:13,18:22)] <- ""

## Column 5: American National Election Study 2012 demographic data ##

## first subset to face-to-face respondents only consistent with Berinsky et al. (2012)
anes12ftf <- anes12[anes12$MODE == "(1) 1. FTF (face-to-face) mode",]

# remove version that includes non-face-to-face responses
rm(anes12)

# empty vectors for means and SEs
anes_means <- rep(NA, 25)
anes_ses <- rep(NA, 25)

## ANES female (1 = male 2 = female)

## NOTE: code to convert factor variables to numeric provided by ICPSR
## change from factor to numeric
lbls <- sort(levels(anes12ftf$GENDER_RESPONDENT_X))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$GENDER_RESPONDENT_X <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$GENDER_RESPONDENT_X))
anes12ftf$GENDER_RESPONDENT_X <- add.value.labels(anes12ftf$GENDER_RESPONDENT_X, lbls)

# gender
anes_means[1] <- round(wtd.mean(anes12ftf$GENDER_RESPONDENT == 2, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[1] <- round(sqrt(wtd.var(anes12ftf$GENDER_RESPONDENT == 2, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$GENDER_RESPONDENT)))*100,1)

## ANES years of education recode to numeric
lbls <- sort(levels(anes12ftf$DEM_EDU))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$DEM_EDU <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$DEM_EDU))
anes12ftf$DEM_EDU <- add.value.labels(anes12ftf$DEM_EDU, lbls)

## recode highest level or school or highest degree to number of years of
## education based on average to achieve degree
anes12ftf$edu.years <- 0
anes12ftf$edu.years[anes12ftf$DEM_EDU == 1] <- 0
anes12ftf$edu.years[anes12ftf$DEM_EDU == 2] <- 4
anes12ftf$edu.years[anes12ftf$DEM_EDU == 3] <- 6
anes12ftf$edu.years[anes12ftf$DEM_EDU == 4] <- 8
anes12ftf$edu.years[anes12ftf$DEM_EDU == 5] <- 9
anes12ftf$edu.years[anes12ftf$DEM_EDU == 6] <- 10
anes12ftf$edu.years[anes12ftf$DEM_EDU == 7] <- 11
anes12ftf$edu.years[anes12ftf$DEM_EDU == 8] <- 11.5
anes12ftf$edu.years[anes12ftf$DEM_EDU == 9] <- 12
anes12ftf$edu.years[anes12ftf$DEM_EDU == 10] <- 13
anes12ftf$edu.years[anes12ftf$DEM_EDU == 11] <- 14
anes12ftf$edu.years[anes12ftf$DEM_EDU == 12] <-14
anes12ftf$edu.years[anes12ftf$DEM_EDU == 13] <- 16
anes12ftf$edu.years[anes12ftf$DEM_EDU == 14] <- 18
anes12ftf$edu.years[anes12ftf$DEM_EDU == 15] <- 20
anes12ftf$edu.years[anes12ftf$DEM_EDU == 16] <- 20
anes12ftf$edu.years[anes12ftf$DEM_EDU == 95] <- NA
anes12ftf$edu.years[anes12ftf$DEM_EDU == -9] <- NA

anes_means[2] <- round(wtd.mean(anes12ftf$edu.years, weights = anes12ftf$WEIGHT_FTF),1)
anes_ses[2] <- round(sqrt(wtd.var(anes12ftf$edu.years, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$edu.years))),1)

## ANES age
## convert to numeric
anes12ftf$DEM_AGE_R_X <- as.numeric(anes12ftf$DEM_AGE_R_X)

anes_means[3] <- round(wtd.mean(anes12ftf$DEM_AGE_R_X, weights = anes12ftf$WEIGHT_FTF),1)
anes_ses[3] <- round(sqrt(wtd.var(anes12ftf$DEM_AGE_R_X, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM_AGE_R_X))),1)

## ANES income 
# convert to numeric
lbls <- sort(levels(anes12ftf$INCGROUP_PRE))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$INCGROUP_PRE <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$INCGROUP_PRE))
anes12ftf$INCGROUP_PRE <- add.value.labels(anes12ftf$INCGROUP_PRE, lbls)

## recode income to median of range provided
anes12ftf$income <- NA
anes12ftf$income[anes12ftf$INCGROUP_PRE==1] <- 2500
anes12ftf$income[anes12ftf$INCGROUP_PRE==2] <- 7500
anes12ftf$income[anes12ftf$INCGROUP_PRE==3] <- 11250
anes12ftf$income[anes12ftf$INCGROUP_PRE==4] <- 13750
anes12ftf$income[anes12ftf$INCGROUP_PRE==5] <- 16250
anes12ftf$income[anes12ftf$INCGROUP_PRE==6] <- 18750
anes12ftf$income[anes12ftf$INCGROUP_PRE==7] <- 21250
anes12ftf$income[anes12ftf$INCGROUP_PRE==8] <- 23750
anes12ftf$income[anes12ftf$INCGROUP_PRE==9] <- 26250
anes12ftf$income[anes12ftf$INCGROUP_PRE==10] <- 28750
anes12ftf$income[anes12ftf$INCGROUP_PRE==11] <- 32500
anes12ftf$income[anes12ftf$INCGROUP_PRE==12] <- 37500
anes12ftf$income[anes12ftf$INCGROUP_PRE==13] <- 42500
anes12ftf$income[anes12ftf$INCGROUP_PRE==14] <- 47500
anes12ftf$income[anes12ftf$INCGROUP_PRE==15] <- 52500
anes12ftf$income[anes12ftf$INCGROUP_PRE==16] <- 57500
anes12ftf$income[anes12ftf$INCGROUP_PRE==17] <- 62500
anes12ftf$income[anes12ftf$INCGROUP_PRE==18] <- 67500
anes12ftf$income[anes12ftf$INCGROUP_PRE==19] <- 72500
anes12ftf$income[anes12ftf$INCGROUP_PRE==20] <- 77500
anes12ftf$income[anes12ftf$INCGROUP_PRE==21] <- 85000
anes12ftf$income[anes12ftf$INCGROUP_PRE==22] <- 95000
anes12ftf$income[anes12ftf$INCGROUP_PRE==23] <- 105000
anes12ftf$income[anes12ftf$INCGROUP_PRE==24] <- 117500
anes12ftf$income[anes12ftf$INCGROUP_PRE==25] <- 137500
anes12ftf$income[anes12ftf$INCGROUP_PRE==26] <- 162500
anes12ftf$income[anes12ftf$INCGROUP_PRE==27] <- 212500
anes12ftf$income[anes12ftf$INCGROUP_PRE==28] <- 250000

# mean income
anes_means[4] <- round(wtd.mean(anes12ftf$income, weights = anes12ftf$WEIGHT_FTF, na.rm = T),0)
anes_ses[4] <- round(sqrt(wtd.var(anes12ftf$income, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$income))),0)

# median income
anes_means[5] <- median(anes12ftf$income, na.rm = T)
anes_ses[5] <- ""

## ANES race

## change race values to numeric codes
lbls <- sort(levels(anes12ftf$DEM_RACECPS_1ST))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$DEM_RACECPS_1ST <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$DEM_RACECPS_1ST))
anes12ftf$DEM_RACECPS_1ST <- add.value.labels(anes12ftf$DEM_RACECPS_1ST, lbls)

## change I don't know and refused to NA
anes12ftf$DEM_RACECPS_1ST[anes12ftf$DEM_RACECPS_1ST == -8] <- NA
anes12ftf$DEM_RACECPS_1ST[anes12ftf$DEM_RACECPS_1ST == -9] <- NA

## change hispanic indicator to numeric code
lbls <- sort(levels(anes12ftf$DEM_HISP))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$DEM_HISP <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$DEM_HISP))
anes12ftf$DEM_HISP <- add.value.labels(anes12ftf$DEM_HISP, lbls)

## if indicated hispanic, change race to hispanic in race variable
anes12ftf$DEM_RACECPS_1ST[anes12ftf$DEM_HISP == 1] <- 6


## white, non-hispanic
anes_means[6] <- round(wtd.mean(anes12ftf$DEM_RACECPS_1ST == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[6] <- round(sqrt(wtd.var(anes12ftf$DEM_RACECPS_1ST == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM_RACECPS_1ST)))*100,1)

## black, non-hispanic
anes_means[7] <- round(wtd.mean(anes12ftf$DEM_RACECPS_1ST == 2, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[7] <- round(sqrt(wtd.var(anes12ftf$DEM_RACECPS_1ST == 2, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM_RACECPS_1ST)))*100,1)

## hispanic
anes_means[8] <- round(wtd.mean(anes12ftf$DEM_RACECPS_1ST == 6, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[8] <- round(sqrt(wtd.var(anes12ftf$DEM_RACECPS_1ST == 6, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM_RACECPS_1ST)))*100,1)

## ANES marital status
## convert to numeric
lbls <- sort(levels(anes12ftf$DEM_MARITAL))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$DEM_MARITAL <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$DEM_MARITAL))
anes12ftf$DEM_MARITAL <- add.value.labels(anes12ftf$DEM_MARITAL, lbls)

## change "refused" to NA
anes12ftf$DEM_MARITAL[anes12ftf$DEM_MARITAL == -9] <- NA

## change "married: spouse absent" to "married"
anes12ftf$DEM_MARITAL[anes12ftf$DEM_MARITAL == 2] <- 1

## married
anes_means[9] <- round(wtd.mean(anes12ftf$DEM_MARITAL == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[9] <- round(sqrt(wtd.var(anes12ftf$DEM_MARITAL == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM_MARITAL)))*100,1)

## ANES home ownership
## convert to numeric code
lbls <- sort(levels(anes12ftf$DEM3_OWNHOME))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$DEM3_OWNHOME <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$DEM3_OWNHOME))
anes12ftf$DEM3_OWNHOME <- add.value.labels(anes12ftf$DEM3_OWNHOME, lbls)

## convert refused, don't know, and other to NA
anes12ftf$DEM3_OWNHOME[anes12ftf$DEM3_OWNHOME == 5 | anes12ftf$DEM3_OWNHOME == -8 |
                         anes12ftf$DEM3_OWNHOME == -9] <- NA

## own home
anes_means[10] <- round(wtd.mean(anes12ftf$DEM3_OWNHOME == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[10] <- round(sqrt(wtd.var(anes12ftf$DEM3_OWNHOME == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$DEM3_OWNHOME)))*100,1)

## ANES religion
## convert to numeric code
lbls <- sort(levels(anes12ftf$RELIG_7CAT_X))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$RELIG_7CAT_X <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$RELIG_7CAT_X))
anes12ftf$RELIG_7CAT_X <- add.value.labels(anes12ftf$RELIG_7CAT_X, lbls)

## change evangelical protestant and black protestant to protestant
anes12ftf$RELIG_7CAT_X[anes12ftf$RELIG_7CAT_X == 2 | anes12ftf$RELIG_7CAT_X == 3] <- 1

## change undifferentiated christian to other
anes12ftf$RELIG_7CAT_X[anes12ftf$RELIG_7CAT_X == 5] <- 7

## none
anes_means[11] <- round(wtd.mean(anes12ftf$RELIG_7CAT_X == 8, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[11] <- round(sqrt(wtd.var(anes12ftf$RELIG_7CAT_X == 8, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$RELIG_7CAT_X)))*100,1)

## protestant
anes_means[12] <- round(wtd.mean(anes12ftf$RELIG_7CAT_X == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[12] <- round(sqrt(wtd.var(anes12ftf$RELIG_7CAT_X == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$RELIG_7CAT_X)))*100,1)

## catholic
anes_means[13] <- round(wtd.mean(anes12ftf$RELIG_7CAT_X == 4, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[13] <- round(sqrt(wtd.var(anes12ftf$RELIG_7CAT_X == 4, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$RELIG_7CAT_X)))*100,1)

## ANES region
## ANES code states as region
anes12ftf$region <- NA
anes12ftf$region[anes12ftf$SAMPLE_STATE == "ME" | anes12ftf$SAMPLE_STATE == "NH" | 
                   anes12ftf$SAMPLE_STATE == "VT" | anes12ftf$SAMPLE_STATE == "MA" |
                   anes12ftf$SAMPLE_STATE == "RI" | anes12ftf$SAMPLE_STATE == "CT" | 
                   anes12ftf$SAMPLE_STATE == "NY" | anes12ftf$SAMPLE_STATE == "NJ" |
                   anes12ftf$SAMPLE_STATE == "PA"] <- "northeast"
anes12ftf$region[anes12ftf$SAMPLE_STATE == "DE" | anes12ftf$SAMPLE_STATE == "MD" |
                   anes12ftf$SAMPLE_STATE == "WV" | anes12ftf$SAMPLE_STATE == "VA" | 
                   anes12ftf$SAMPLE_STATE == "KY" | anes12ftf$SAMPLE_STATE == "TN"
                 | anes12ftf$SAMPLE_STATE == "NC" | anes12ftf$SAMPLE_STATE =="SC"
                 | anes12ftf$SAMPLE_STATE == "GA" | anes12ftf$SAMPLE_STATE == "FL"
                 | anes12ftf$SAMPLE_STATE =="AL" | anes12ftf$SAMPLE_STATE =="MS"
                 | anes12ftf$SAMPLE_STATE =="LA" | anes12ftf$SAMPLE_STATE =="AR"
                 | anes12ftf$SAMPLE_STATE == "OK" | anes12ftf$SAMPLE_STATE =="TX"
                 | anes12ftf$SAMPLE_STATE == "DC"] <- "south"
anes12ftf$region[anes12ftf$SAMPLE_STATE =="OH" | anes12ftf$SAMPLE_STATE =="MI" 
                 | anes12ftf$SAMPLE_STATE =="IN" | anes12ftf$SAMPLE_STATE =="IL"
                 | anes12ftf$SAMPLE_STATE =="WI"| anes12ftf$SAMPLE_STATE =="MO"
                 | anes12ftf$SAMPLE_STATE =="IA" | anes12ftf$SAMPLE_STATE =="MN"
                 | anes12ftf$SAMPLE_STATE =="KS" | anes12ftf$SAMPLE_STATE =="NE"
                 | anes12ftf$SAMPLE_STATE =="SD"| anes12ftf$SAMPLE_STATE =="ND"] <- "midwest"
anes12ftf$region[anes12ftf$SAMPLE_STATE =="NM"| anes12ftf$SAMPLE_STATE =="CO"
                 | anes12ftf$SAMPLE_STATE =="WY"| anes12ftf$SAMPLE_STATE =="MT"
                 | anes12ftf$SAMPLE_STATE =="AZ"| anes12ftf$SAMPLE_STATE =="UT"
                 | anes12ftf$SAMPLE_STATE =="ID"| anes12ftf$SAMPLE_STATE =="CA"
                 | anes12ftf$SAMPLE_STATE =="NV"| anes12ftf$SAMPLE_STATE =="OR"
                 | anes12ftf$SAMPLE_STATE =="WA" | anes12ftf$SAMPLE_STATE =="HI"
                 | anes12ftf$SAMPLE_STATE =="AK"] <- "west"

# Northeast
anes_means[14] <- round(wtd.mean(anes12ftf$region == "northeast", weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[14] <- round(sqrt(wtd.var(anes12ftf$region == "northeast", anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$region)))*100,1)

# midwest
anes_means[15] <- round(wtd.mean(anes12ftf$region == "midwest", weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[15] <- round(sqrt(wtd.var(anes12ftf$region == "midwest", anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$region)))*100,1)

# south
anes_means[16] <- round(wtd.mean(anes12ftf$region == "south", weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[16] <- round(sqrt(wtd.var(anes12ftf$region == "south", anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$region)))*100,1)

# west
anes_means[17] <- round(wtd.mean(anes12ftf$region == "west", weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[17] <- round(sqrt(wtd.var(anes12ftf$region == "west", anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$region)))*100,1)

## ANES voting behavior
## convert to numeric code
lbls <- sort(levels(anes12ftf$PREVOTE_REGIST_ADDR))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$PREVOTE_REGIST_ADDR <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$PREVOTE_REGIST_ADDR))
anes12ftf$PREVOTE_REGIST_ADDR <- add.value.labels(anes12ftf$PREVOTE_REGIST_ADDR, lbls)

## registered
anes_means[23] <- round(wtd.mean(anes12ftf$PREVOTE_REGIST_ADDR == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[23] <- round(sqrt(wtd.var(anes12ftf$PREVOTE_REGIST_ADDR == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$PREVOTE_REGIST_ADDR)))*100,1)

## convert turnout 2008 to numeric code
lbls <- sort(levels(anes12ftf$INTEREST_VOTED2008))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$INTEREST_VOTED2008 <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$INTEREST_VOTED2008))
anes12ftf$INTEREST_VOTED2008 <- add.value.labels(anes12ftf$INTEREST_VOTED2008, lbls)

## ANES turnout 2008
anes_means[24] <- round(wtd.mean(anes12ftf$INTEREST_VOTED2008 == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_ses[24] <- round(sqrt(wtd.var(anes12ftf$INTEREST_VOTED2008 == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$INTEREST_VOTED2008)))*100,1)

# put means and SEs together nicely for table column
anes_msd <- paste(anes_means," (",anes_ses,")",sep="")

# add N row
anes_msd[25] <- "2,004-2,054"

# make sure blank rows are blank instead of NA for table
anes_msd[18:22] <- ""

#########################################

# now combine all columns with means and SEs
all_msd <- cbind(dl_msd, mt_msd, anesp_msd, cps_msd, anes_msd)

# label columns for table
cols <- c("DLABSS", "MTurk", "ANESP", "CPS 2012", "ANES 2012")

# label rows for table
rows <- c("Female", "Education (years)", "Age (years)", "Mean income", "Median income", "White", "Black",
          "Hispanic", "Married", "Own home", "None", "Protestant", "Catholic", "Northeast", "Midwest",
          "South", "West", "Democrat", "Independent/Other", "Republican", "Liberal", "Conservative",
          "Registered", "Voted 2008", "N")

# combine all together into matrix for xtable
Tab1 <- matrix(all_msd,25,5,dimnames=list(rows,cols))

# use xtable to produce latex code for table
# latex code is output as a file to the working directory
print(xtable(Tab1, align = c("l","c","c","c","c","c")), file = getOption("Table1.file", "Table1"))

#########################
## End Table 1 ##########
#########################

##########################
## Figure 2, Main paper ##
##########################

data <- read.csv("datasets/response_quality_data.csv")

##########CRITERION 1: Straightlining (binary measurement- uniform response category or not, 
# proportion of 7 questions on which r straightlined)

################ Straightlining data processing: 

# Each for-loop below reads a straightlining block as a vector. Where variable names or block lengths 
# are different across survey versions, the loop accounts for this. The loop then check how many unique
# responses were offered for that straightlining block, and codes a 1 for "straightlined" a "0" for 
# did not straightline (more than 1 unique value aside from NA in the vector), or a 
# "skipped" if all responses for that block are NA. 

#sl_1  both surveys Long items, Inconsistent expectation (5 items)
data$straightliner_1 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  r1 <- data$sl_LI_1_1[i]
  r2 <- data$sl_LI_1_2[i]
  r3 <- data$sl_LI_1_3[i]
  r4 <- data$sl_LI_1_4[i]
  r5 <- data$sl_LI_1_5[i]
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5))))
  l<- length(vals)
  if(l > 1){ data$straightliner_1[i] <- 0} 
  if(l==1){data$straightliner_1[i] <- 1 }  
  if(l==0) {data$straightliner_1[i] <- "skipped" } 
  #print(i)
  i+1
}


#sl_2   both surveys Long items, Inconsistent expectation (5 items FEP, 6 items Secular)
data$straightliner_2 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  r1 <- data$sl_LI_2_1[i]
  r2 <- data$sl_LI_2_2[i]
  r3 <- data$sl_LI_2_3[i]
  r4 <- data$sl_LI_2_4[i]
  r5 <- data$sl_LI_2_5[i]
  r6 <- data$sl_LI_2_6[i]
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5, r6))))
  l <- length(vals)
  if(l > 1){ data$straightliner_2[i] <- 0}
  if(l==1){data$straightliner_2[i] <- 1 }  
  if(l==0) {data$straightliner_2[i] <- "skipped" } 
  #print(i)
  i+1
}

#sl_3   both surveys short items, consistent expectation (3 items FEP, 4 items Secular)
data$straightliner_3 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  r1 <- data$sl_SC_3_1[i]
  r2 <- data$sl_SC_3_2[i]
  r3 <- data$sl_SC_3_3[i]
  r4 <- data$sl_SC_3_4[i]
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4))))
  l <- length(vals)
  if(l > 1){ data$straightliner_3[i] <- 0}
  if(l==1){data$straightliner_3[i] <- 1 }  
  if(l==0) {data$straightliner_3[i] <- "skipped" } 
  #print(i)
  i+1
}

#sl_4 FEP Long, inconsistent (6 items)
#sl_4 Secular Short, consistent (6 items)
data$straightliner_4 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(is.na(data$sl_SC_4_1[i])){
    r1 <- data$sl_LI_4_1[i]
    r2 <- data$sl_LI_4_2[i]
    r3 <- data$sl_LI_4_3[i]
    r4 <- data$sl_LI_4_4[i]
    r5 <- data$sl_LI_4_5[i]
    r6 <- data$sl_LI_4_6[i]
  }else{
    r1 <- data$sl_SC_4_1[i]
    r2 <- data$sl_SC_4_2[i]
    r3 <- data$sl_SC_4_3[i]
    r4 <- data$sl_SC_4_4[i]
    r5 <- data$sl_SC_4_5[i]
    r6 <- data$sl_SC_4_6[i]
  }
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5, r6))))
  l <- length(vals)
  if(l > 1){ data$straightliner_4[i] <- 0}
  if(l==1){data$straightliner_4[i] <- 1 }  
  if(l==0) {data$straightliner_4[i] <- "skipped" } 
  #print(i)
  i+1
}

#sl_5 FEP short, inconsistent (6 items)
#sl_5 Secular Long, inconsistent (6 items)
data$straightliner_5 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(is.na(data$sl_SI_5_1[i])){
    r1 <- data$sl_LI_5_1[i]
    r2 <- data$sl_LI_5_2[i]
    r3 <- data$sl_LI_5_3[i]
    r4 <- data$sl_LI_5_4[i]
    r5 <- data$sl_LI_5_5[i]
    r6 <- data$sl_LI_5_6[i]
  }else{
    r1 <- data$sl_SI_5_1[i]
    r2 <- data$sl_SI_5_2[i]
    r3 <- data$sl_SI_5_3[i]
    r4 <- data$sl_SI_5_4[i]
    r5 <- data$sl_SI_5_5[i]
    r6 <- data$sl_SI_5_6[i]
  }
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5, r6))))
  l <- length(vals)
  if(l > 1){ data$straightliner_5[i] <- 0}
  if(l==1){data$straightliner_5[i] <- 1 }  
  if(l==0) {data$straightliner_5[i] <- "skipped" } 
  #print(i)
  i+1
}

#sl_6 FEP short, inconsistent (6 items)
#sl_6 Secular short, consistent (6 items)
data$straightliner_6 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(is.na(data$sl_SC_6_1[i])){
    r1 <- data$sl_SI_6_1[i]
    r2 <- data$sl_SI_6_2[i]
    r3 <- data$sl_SI_6_3[i]
    r4 <- data$sl_SI_6_4[i]
    r5 <- data$sl_SI_6_5[i]
    r6 <- data$sl_SI_6_6[i]
  }else{
    r1 <- data$sl_SC_6_1[i]
    r2 <- data$sl_SC_6_2[i]
    r3 <- data$sl_SC_6_3[i]
    r4 <- data$sl_SC_6_4[i]
    r5 <- data$sl_SC_6_5[i]
    r6 <- data$sl_SC_6_6[i]
  }
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5, r6))))
  l <- length(vals)
  if(l > 1){ data$straightliner_6[i] <- 0}
  if(l==1){data$straightliner_6[i] <- 1 }  
  if(l==0) {data$straightliner_6[i] <- "skipped" } 
  #print(i)
  i+1
}


#sl_7 FEP Long, inconsistent (8 items)
#sl_7 Secular short, inconsistent (9 items)
data$straightliner_7 <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(is.na(data$sl_SI_7_1[i])){
    r1 <- data$sl_LI_7_1[i]
    r2 <- data$sl_LI_7_2[i]
    r3 <- data$sl_LI_7_3[i]
    r4 <- data$sl_LI_7_4[i]
    r5 <- data$sl_LI_7_5[i]
    r6 <- data$sl_LI_7_6[i]
    r7 <- data$sl_LI_7_7[i]
    r8 <- data$sl_LI_7_8[i]
    r9 <- NA
  }else{
    r1 <- data$sl_SI_7_1[i]
    r2 <- data$sl_SI_7_2[i]
    r3 <- data$sl_SI_7_3[i]
    r4 <- data$sl_SI_7_4[i]
    r5 <- data$sl_SI_7_5[i]
    r6 <- data$sl_SI_7_6[i]
    r7 <- data$sl_SI_7_7[i]
    r8 <- data$sl_SI_7_8[i]
    r9 <- data$sl_SI_7_9[i]
  }
  
  vals <- as.vector(na.omit(unique(c(r1, r2, r3, r4, r5, r6, r7, r8, r9))))
  l <- length(vals)
  if(l > 1){ data$straightliner_7[i] <- 0}
  if(l==1){data$straightliner_7[i] <- 1 }  
  if(l==0) {data$straightliner_7[i] <- "skipped" } 
  #print(i)
  i+1
}

####SL processing: 

# This gets the number of blocks skipped from the total number of straightlining blocks, 
# which can be used to determine "complete quitters". "Complete quitters" have 7 skips, one 
# for each SL block. 
data$sl_skips <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(data$straightliner_1[i] == "skipped"){data$sl_skips[i] <- 1}else {data$sl_skips[i] <- 0}
  if(data$straightliner_2[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
  if(data$straightliner_3[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
  if(data$straightliner_4[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
  if(data$straightliner_5[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
  if(data$straightliner_6[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
  if(data$straightliner_7[i] =="skipped"){data$sl_skips[i] <- data$sl_skips[i] + 1}  
}

table(data$sl_skips, data$volunteer)

########################################################################
# Processing SL blocks 1,2, & 7separately in order to use inconsistent-expectation blocks only in 
# responses for regression analysis.

# Total number of blocks skipped of three inconsistent blocks:
data$sl_skips_incon <- rep(NA, nrow(data))
for(i in 1:nrow(data)){
  if(data$straightliner_1[i] == "skipped"){data$sl_skips_incon[i] <- 1}else {data$sl_skips_incon[i] <- 0}
  if(data$straightliner_2[i] =="skipped"){data$sl_skips_incon[i] <- data$sl_skips_incon[i] + 1}  
  if(data$straightliner_7[i] =="skipped"){data$sl_skips_incon[i] <- data$sl_skips_incon[i] + 1}  
}

table(data$sl_skips_incon)

# This loop gets the total number of inconsistent blocks straightlined by respondent and divides 
# by the total blocks that respondent took. 

# NOTE: NAs in this loop are from converting characters to numeric and nothing to worry about
data$straightliner_incon <- rep(NA, nrow(data)) 
for(i in 1:nrow(data)){
  data$straightliner_incon[i] <- sum(na.omit(as.numeric(as.vector(c(data$straightliner_1[i], data$straightliner_2[i], data$straightliner_1[i]))))) / (3-data$sl_skips_incon[i])}

length(data$straightliner_incon)
length(which(is.na(data$straightliner_incon)))
################End straightlining data processing


#####################################################################
## Additional data processing of partial and complete "quitters" on the basis of SL data:

# The sl_quit variable includes any respondent who dropped out before finishing all 7 sl blocks, 
# but may have completed 1 or more blocks. 
data$sl_quit <- rep(0, nrow(data))
data$sl_quit[which(data$sl_skips > 0)] <- 1

# The complete_quit variable includes respondents who quit before taking any of the SL blocks, 
# ie, took the master survey and quit or entered their email and quit. 
data$complete_quit<- rep(0, nrow(data))
data$complete_quit [which(data$sl_skips ==7)] <- 1

# drop anyone who quit by end of sl section, even if they completed some SL questions
data<- data[-which(data$sl_quit ==1), ] 
dim(data)  

# dimensions of data should be 1861 X 223

# Straightlining Test 1 of 1 (proportion of 3 inconsistent sl questions on which respondent 
# straightlined

reg1i <- lm(straightliner_incon ~ volunteer, data[is.na(data$straightliner_incon)==F, ])
#summary(reg1i)

reg1i.f  <- lm(straightliner_incon ~ volunteer, data[data$survey==1 & is.na(data$straightliner_incon)==F, ])
#summary(reg1i.f)

reg1i.s  <- lm(straightliner_incon ~ volunteer, data[data$survey==0 & is.na(data$straightliner_incon)==F, ])
#summary(reg1i.s)

c2.reg1i <- lm(straightliner_incon~ volunteer + age.years + income + factor(race) + liberal + factor(partyID) + Education + survey , data[is.na(data$straightliner_incon)==F, ])
#summary(c2.reg1i)

c2.reg1i.f  <- lm(straightliner_incon ~ volunteer + age.years + income + factor(race) + liberal + factor(partyID) + Education, data[data$survey==1 & is.na(data$straightliner_incon)==F, ])
#summary(c2.reg1i.f)

c2.reg1i.s  <- lm(straightliner_incon ~ volunteer + age.years + income + factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education  , data[data$survey==0 & is.na(data$straightliner_incon)==F, ])
#summary(c2.reg1i.s)

########CRITERION 2: Time investment
# time spent reading article (in seconds) 
data$time_article_Page.Submit <- as.numeric(data$time_article_Page.Submit)
# checking for alternative codings of missing data
length(which(is.na(data$time_article_Page.Submit)))
length(which(data$time_article_Page.Submit == ""| data$time_article_Page.Submit == -99))

# remove extreme outliers in timing question
lower <- quantile(na.omit(data$time_article_Page.Submit), c(0.025))
upper <- quantile(na.omit(data$time_article_Page.Submit), c(0.975))
data2 <- data[data$time_article_Page.Submit > lower & data$time_article_Page.Submit < upper,]

table(is.na(data2$time_article_Page.Submit))
data2<- data2[is.na(data2$time_article_Page.Submit) ==F, ]
dim(data2) # 1712 x 223

## Time Investment Test 1 of 4 (Time Spent Reading Article)
# without controls
reg2 <- lm(time_article_Page.Submit ~  volunteer , data2) #combined
#summary(reg2)

reg2.f <- lm(time_article_Page.Submit ~  volunteer ,   data2[data2$survey==1,]) #fep survey only
#summary(reg2.f)

reg2.s <- lm(time_article_Page.Submit ~  volunteer ,  data= data2[data2$survey==0,]) #sec survey only
#summary(reg2.s)

#added controls: 
c2.reg2 <- lm(time_article_Page.Submit ~  volunteer + age.years + income + factor(race) + liberal + factor(partyID) + Education + survey,  data= data2) #combined
#summary(c2.reg2)

c2.reg2.f <- lm(time_article_Page.Submit ~  volunteer + age.years + income + factor(race) + liberal + factor(partyID) + Education ,  data= data2[data2$survey==1,]) #fep survey only
#summary(c2.reg2.f)

c2.reg2.s <- lm(time_article_Page.Submit ~  volunteer + age.years + income + factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education ,  data= data2[data2$survey==0,]) #sec survey only
#summary(c2.reg2.s)

#time spent on time investment page questions
data$ti_quest_time_Page.Submit <- as.numeric(data$ti_quest_time_Page.Submit)
length(which(is.na(data$time_quest_time_Page.Submit)))
length(which(data$time_quest_time_Page.Submit == ""| data$time_quest_time_Page.Submit == -99))

# remove extreme outliers in timing question
lower <- quantile(na.omit(data$ti_quest_time_Page.Submit), c(0.025))
upper <- quantile(na.omit(data$ti_quest_time_Page.Submit), c(0.975))
data2b <- data[data$ti_quest_time_Page.Submit > lower & data$ti_quest_time_Page.Submit < upper,]

length(data2b$ti_quest_time_Page.Submit)
table(is.na(data2b$ti_quest_time_Page.Submit))
data2b<- data2b[is.na(data2b$ti_quest_time_Page.Submit) ==F, ]

## Time Investment Test 2 of 4 (Time Spent Answering Questions About Article)
# without controls
reg3 <- lm(ti_quest_time_Page.Submit ~ volunteer, data2b)
#summary(reg3)

reg3.f <- lm(ti_quest_time_Page.Submit ~ volunteer, data2b[data2b$survey== 1, ])
#summary(reg3.f)

reg3.s  <- lm(ti_quest_time_Page.Submit ~ volunteer, data2b[data2b$survey== 0, ])
#summary(reg3.s)

#added controls: 
c2.reg3 <- lm(ti_quest_time_Page.Submit ~  volunteer + age.years + income + factor(race) + liberal + factor(partyID) + Education + survey ,  data= data2b) #combined
#summary(c2.reg3)

c2.reg3.f <- lm(ti_quest_time_Page.Submit ~  volunteer  + age.years + income +  factor(race) + liberal + factor(partyID) + Education ,  data= data2b[data2b$survey==1,]) #fep survey only
#summary(c2.reg3.f)

c2.reg3.s <- lm(ti_quest_time_Page.Submit ~  volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education ,  data= data2b[data2b$survey==0,]) #sec survey only
#summary(c2.reg3.s) 

#Time Spent Responding to open-ended questions
data$oe1_time_Page.Submit <- as.numeric(data$oe1_time_Page.Submit)
length(which(is.na(data$oe1_time_Page.Submit)))

# remove extreme outliers in timing question
lower <- quantile(na.omit(data$oe1_time_Page.Submit), c(0.025))
upper <- quantile(na.omit(data$oe1_time_Page.Submit), c(0.975))
data3 <- data[data$oe1_time_Page.Submit > lower & data$oe1_time_Page.Submit < upper,]

length(which(is.na(data3$oe1_time_Page.Submit)))
length(which(data3$oe1_time_Page.Submit == "" | which(data3$oe1_time_Page.Submit == -99) ))
data3 <- data3[!is.na(data3$oe1_time_Page.Submit), ]

## Time Investment Test 3 of 4 (Time Spent Responding to open-ended questions, Question 1) 
# without controls:
reg4 <- lm(oe1_time_Page.Submit ~ volunteer, data3)
#summary(reg4)

reg4.f <- lm(oe1_time_Page.Submit ~ volunteer, data3[data3$survey==1, ])
#summary(reg4.f)

reg4.s <- lm(oe1_time_Page.Submit ~ volunteer, data3[data3$survey==0, ])
#summary(reg4.s)

#added controls:

c2.reg4 <- lm(oe1_time_Page.Submit ~ volunteer +  age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey  , data3)
#summary(c2.reg4)

c2.reg4.f <- lm(oe1_time_Page.Submit ~ volunteer +  age.years + income +  factor(race) + liberal + factor(partyID) + Education  , data3[data3$survey==1, ])
#summary(c2.reg4.f)

c2.reg4.s <- lm(oe1_time_Page.Submit ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education , data3[data3$survey==0, ])
#summary(c2.reg4.s)

data$oe2_time_Page.Submit <- as.numeric(data$oe2_time_Page.Submit)

length(which(is.na(data$oe2_time_Page.Submit)))
length(which(data$oe2_time_Page.Submit == "" | which(data$oe2_time_Page.Submit == -99)))

# remove extreme outliers in timing question
lower <- quantile(na.omit(data$oe2_time_Page.Submit), c(0.025))
upper <- quantile(na.omit(data$oe2_time_Page.Submit), c(0.975))
data4 <- data[data$oe2_time_Page.Submit > lower & data$oe2_time_Page.Submit < upper,]

data4 <- data[is.na(data$oe2_time_Page.Submit) ==F, ]

## Time Investment Test 4 of 4 (Time Spent Responding to open-ended questions, Question 2) 
#without controls
reg5 <- lm(oe2_time_Page.Submit ~ volunteer, data4)
#summary(reg5)

reg5.f <- lm(oe2_time_Page.Submit ~ volunteer, data4[data4$survey==1, ])
#summary(reg5.f)

reg5.s <- lm(oe2_time_Page.Submit ~ volunteer, data4[data4$survey==0, ])
#summary(reg5.s)

#added controls
c2.reg5 <- lm(oe2_time_Page.Submit ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data4)
#summary(c2.reg5)

c2.reg5.f <- lm(oe2_time_Page.Submit ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education, data4[data4$survey==1, ])
#summary(c2.reg5.f)

c2.reg5.s <- lm(oe2_time_Page.Submit ~ volunteer +  age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education, data4[data4$survey==0, ])
#summary(c2.reg5.s)

data$oe_time_avg <- (as.numeric(data$oe1_time_Page.Submit) + as.numeric(data$oe2_time_Page.Submit))/2
# remove extreme outliers 
lower <- quantile(na.omit(data$oe_time_avg), c(0.025))
upper <- quantile(na.omit(data$oe_time_avg), c(0.975))
data_avg <- data[data$oe_time_avg > lower & data$oe_time_avg < upper, ]

length(which(is.na(data_avg$oe_time_avg)))

c2.reg13 <- lm(oe_time_avg ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data_avg[-which(is.na(data_avg$oe_time_avg)),])
#summary(c2.reg13)

reg13 <- lm(oe_time_avg ~ volunteer, data_avg[-which(is.na(data_avg$oe_time_avg)),])
#summary(reg13)

########## ATTENTION CHECK: 
## attention (secular only)
# DLABSS
mean(grepl("yes", tolower(data$ti_5[data$survey==0 & data$volunteer==1])))
# Mturk
mean(grepl("yes", tolower(data$ti_5[data$survey==0 & data$volunteer==0])))

length(which(is.na(data$ti_5)))
length(which(data$ti_5 == "" & data$survey == 0))
length(which(data$ti_5 == -99 & data$survey == 0 ))

data$check <- rep(NA, nrow(data)) 
data$check[grepl("yes", tolower(data$ti_5))== TRUE & data$survey==0   ] <- 1
data$check[grepl("yes", tolower(data$ti_5))== FALSE & data$survey==0  ] <- 0
data$check[is.na(data$ti_4)] <- NA

table(data$check, data$volunteer)  

c.att.out <- lm(check ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education, data[data$survey == 0, ])
#summary(c.att.out)

att.out <- lm(check ~ volunteer , data[data$survey == 0, ])
#summary(att.out)

########CRITERION 3: Skipping
data$skipped <- data$sk_1
data$skipped[data$skipped <5] <- 0
data$skipped[data$skipped == 5] <- 1
data$skipped[data$skipped == -99] <- NA

length(which(is.na(data$skipped)))
length(which(data$skipped == -99) | which(data$skipped == ""))

table(is.na(data$skipped), data$volunteer)

## Skipping Test 1 of 1
# without controls:
reg6 <- lm(skipped ~ volunteer, data)
#summary(reg6)

reg6.f <- lm(skipped ~ volunteer, data[data$survey==1, ]) 
#summary(reg6.f)

reg6.s <- lm(skipped ~ volunteer, data[data$survey==0, ])
#summary(reg6.s)

#added controls 
c2.reg6 <- lm(skipped ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data)
#summary(c2.reg6)

c2.reg6.f <- lm(skipped ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education, data[data$survey==1, ])
#summary(c2.reg6.f)

c2.reg6.s <- lm(skipped ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services  + prot + other_chr + other_non + Education, data[data$survey==0, ])
#summary(c2.reg6.s)

##########CRITERION 4: COMMITMENT (Defined as unsure = not committed)
data$commit1_recode <- data$commit_1 
data$commit1_recode[data$commit1_recode =="1" | data$commit1_recode =="2" | data$commit1_recode =="4" | data$commit1_recode =="5"] <- 0
data$commit1_recode[data$commit1_recode =="3"] <- 1
data$commit1_recode[data$commit1_recode == -99] <- NA
data$commit1_recode <- as.numeric(data$commit1_recode)

length(which(is.na(data$commit1_recode)))
length(which(data$commit1_recode == "" | which(data$commit1_recode == -99)))

data$commit2_recode <- data$commit_2
data$commit2_recode[data$commit2_recode =="1" | data$commit2_recode =="2" | data$commit2_recode =="4" | data$commit2_recode =="5"] <- 0
data$commit2_recode[data$commit2_recode =="3"] <- 1
data$commit2_recode[data$commit2_recode == -99] <- NA
data$commit2_recode <- as.numeric(data$commit2_recode)

length(which(is.na(data$commit2_recode)))
length(which(data$commit2_recode == "" | which(data$commit2_recode == -99)))

## COMMITMENT TEST 1 of 2  (answered unsure on commitment question #1)
# without controls
reg7 <- lm(commit1_recode ~ volunteer, data)
#summary(reg7)

reg7.f <- lm(commit1_recode ~ volunteer, data[data$survey ==1, ])
#summary(reg7.f)

reg7.s <- lm(commit1_recode ~ volunteer, data[data$survey ==0, ])
#summary(reg7.s)

#added controls: 
c2.reg7 <- lm(commit1_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data)
#summary(c2.reg7)

c2.reg7.f <- lm(commit1_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education, data[data$survey ==1, ])
#summary(c2.reg7.f)

c2.reg7.s <- lm(commit1_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education, data[data$survey ==0, ])
#summary(c2.reg7.s)

## COMMITMENT TEST 2 of 2  (answered unsure on commitment question #2)
# without controls
reg8 <- lm(commit2_recode ~ volunteer, data)
#summary(reg8)

reg8.f <- lm(commit2_recode ~ volunteer, data[data$survey ==1, ])
#summary(reg8.f)

reg8.s <- lm(commit2_recode ~ volunteer, data[data$survey ==0, ])
#summary(reg8.s)

#added controls: 
c2.reg8 <- lm(commit2_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data)
#summary(c2.reg8) 

c2.reg8.f <- lm(commit2_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education , data[data$survey ==1, ])
#summary(c2.reg8.f)

c2.reg8.s <- lm(commit2_recode ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education, data[data$survey ==0, ])
#summary(c2.reg8.s)

## Average across commitment questions

data$commit_avg <- (data$commit1_recode + data$commit2_recode)/2

c2.reg14 <- lm(commit_avg ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data)
#summary(c2.reg14) 

reg14 <- lm(commit_avg ~ volunteer, data)
#summary(reg14) 

##########CRITERION 5: CONSISTENCY
#####consistency data processing
###consistency, SEC only
data.sec.con <- data[!is.na(data$sl_SC_4_5),]
data.sec.con$con_prompt <- rep(0, nrow(data.sec.con))
data.sec.con$consist_comb <- rep(NA, nrow(data.sec.con))
for(i in 1:length(data.sec.con$consist_comb)){
  if(is.na(data.sec.con$consist_1[i]) == TRUE){
    data.sec.con$consist_comb[i] <- data.sec.con$consist_2[i]
    data.sec.con$con_prompt[i] <- 1 }else
    {data.sec.con$consist_comb[i] <- data.sec.con$consist_1[i]}
}

table(is.na(data.sec.con$consist_comb))
data.sec.con <- data.sec.con[!is.na(data.sec.con$consist_comb),]
data.sec.con$consistent <- 0

#coarsened version: by direction (1|2 = 1|2; 3=3 ; 4|5 = 4|5)
data.sec.con$consistent[data.sec.con$sl_SC_4_5 < 3 & data.sec.con$consist_comb < 3] <- 1 
data.sec.con$consistent[data.sec.con$sl_SC_4_5 == 3 & data.sec.con$consist_comb == 3] <- 1 
data.sec.con$consistent[data.sec.con$sl_SC_4_5 > 3 & data.sec.con$consist_comb > 3] <- 1 
table(data.sec.con$consistent)

###consistency, FEP
#To what extent are each of the following things good for America's national interest? 
#Tougher requirements for foreign investors
#1=very beneficial, 5 = very harmful
data.fep.con <- data[!is.na(data$sl_LI_7_6),]
table(data.fep.con$sl_LI_7_6)
data.fep.con$con_prompt <- rep(0, nrow(data.fep.con))
data.fep.con$consist_comb <- rep(NA, nrow(data.fep.con))
for(i in 1:length(data.fep.con$consist_comb)){
  if(is.na(data.fep.con$consist_1[i]) == TRUE){
    data.fep.con$consist_comb[i] <- data.fep.con$consist_2[i]
    data.fep.con$con_prompt[i] <- 1 }else
    {data.fep.con$consist_comb[i] <- data.fep.con$consist_1[i]}
}

table(is.na(data.fep.con$consist_comb))
data.fep.con <- data.fep.con[!is.na(data.fep.con$consist_comb),]
table(data.fep.con$con_prompt)

data.fep.con$consistent <- 0
#coarsened version: by direction (1|2 = 1|2; 3=3 ; 4|5 = 4|5)
data.fep.con$consistent[data.fep.con$sl_LI_7_6 < 3 & data.fep.con$consist_comb < 3] <- 1 
data.fep.con$consistent[data.fep.con$sl_LI_7_6 == 3 & data.fep.con$consist_comb == 3] <- 1 
data.fep.con$consistent[data.fep.con$sl_LI_7_6 > 3 & data.fep.con$consist_comb > 3] <- 1 
table(data.fep.con$consistent)

##re-combining these datasets for combined regression: 
data.remerge <- merge(data.fep.con, data.sec.con, all = TRUE)

table(is.na(data.remerge$consistent))
table(is.na(data.fep.con$consistent))
table(is.na(data.sec.con$consistent))

#### end consistency data processing

## CONSISTENCY TEST 1 of 1  (answered in consistent direction (coarsened) on substantively similar questions at beginning and end of survey)
# without controls
reg9 <- lm (consistent ~ volunteer, data.remerge)
#summary(reg9)

reg9.f <- lm(consistent ~ volunteer, data.fep.con)
#summary(reg9.f)

reg9.s <- lm(consistent ~ volunteer, data.sec.con)
#summary(reg9.s)

#added controls: 
c2.reg9 <- lm (consistent ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data.remerge )
#summary(c2.reg9)

c2.reg9.f <- lm(consistent ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education, data.fep.con)
#summary(c2.reg9.f)

c2.reg9.s <- lm(consistent ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr+ other_non + Education, data.sec.con)
#summary(c2.reg9.s)

##########CRITERION 6: OPEN-ENDED QUESTIONS

############################
##Getting Blanks and Missingness for the Open-ended questions:

data$oe1_skippers <- rep(0, nrow(data) )
data$oe1_skippers[which(data$oe_1 == -99)] <- 1  #Mturk version respondents who viewed question and skipped it.
data$oe1_skippers[which(data$oe_1 == "" & is.na(data$oe1_time_Page.Submit)== F)] <- 1  #DLABSS version respondents who viewed and skipped
table(data$oe_1 == -99)
table(data$oe1_skippers,  data$volunteer, data$survey)

data$oe1_droppers <- rep(0, nrow(data) )
data$oe1_droppers[is.na(data$oe1_time_Page.Submit)== T] <- 1  
table(is.na(data$oe1_time_Page.Submit))
table(data$oe1_droppers, data$volunteer, data$survey)

data$oe2_skippers <- rep(0, nrow(data) )
data$oe2_skippers[which(data$oe_2 == -99)] <- 1
data$oe2_skippers[which(data$oe_2 == "" & is.na(data$oe2_time_Page.Submit)== F)] <- 1
table(data$oe_2 == -99)
table(data$oe2_skippers,  data$volunteer, data$survey)

data$oe2_droppers <- rep(0, nrow(data) )
data$oe2_droppers[is.na(data$oe2_time_Page.Submit)== T] <- 1  
table(is.na(data$oe2_time_Page.Submit))
table(data$oe2_droppers, data$volunteer, data$survey)

#length of response to open-ended questions
data$oe_1_chars <-  nchar(as.character(data$oe_1))
data$oe_2_chars <-  nchar(as.character(data$oe_2))

## OPEN-ENDED TEST 1 OF 2 (number of characters on open-ended question 1)
# without controls
reg10 <- lm(oe_1_chars ~ volunteer, data = data[data$oe1_droppers == 0,  ] )
#summary(reg10)

reg10.f <- lm(oe_1_chars ~ volunteer, data = data[data$survey ==1 & data$oe1_droppers == 0, ] ) #excludes those who dropped prior to question
#summary(reg10.f)

reg10.s <- lm(oe_1_chars ~ volunteer, data = data[data$survey ==0 & data$oe1_droppers == 0, ] )
#summary(reg10.s)

#added controls:
c2.reg10 <- lm(oe_1_chars ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey,  data[data$oe1_droppers == 0, ] )
#summary(c2.reg10)

c2.reg10.f <- lm(oe_1_chars ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education, data[data$survey ==1 & data$oe1_droppers == 0, ] )
#summary(c2.reg10.f)

c2.reg10.s <- lm(oe_1_chars ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education, data = data[data$survey ==0 & data$oe1_droppers == 0, ] )
#summary(c2.reg10.s)

## OPEN-ENDED TEST 2 OF 2 (number of characters on open-ended question 2)
# without controls
reg11 <- lm(oe_2_chars ~ volunteer, data = data[data$oe2_droppers == 0, ] )
#summary(reg11)

reg11.f <- lm(oe_2_chars ~ volunteer, data = data[data$survey ==1  & data$oe2_droppers == 0, ] )
#summary(reg11.f)

reg11.s <- lm(oe_2_chars ~ volunteer, data = data[data$survey ==0 & data$oe2_droppers == 0, ] )
#summary(reg11.s)

#added controls
c2.reg11 <- lm(oe_2_chars ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data[data$oe2_droppers == 0, ] )
#summary(c2.reg11)

c2.reg11.f <- lm(oe_2_chars ~ volunteer +age.years + income +  factor(race) + liberal + factor(partyID) + Education, data[data$survey ==1 & data$oe2_droppers == 0, ] )
#summary(c2.reg11.f)

c2.reg11.s <- lm(oe_2_chars ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education , data[data$survey ==0 & data$oe2_droppers == 0, ] )
#summary(c2.reg11.s)

## average of open-ended responses
data$oe_avg <- (data$oe_1_chars + data$oe_2_chars)/2

reg12 <- lm(oe_avg ~ volunteer, data[data$oe2_droppers == 0,] )
#summary(reg12)

reg12.f <- lm(oe_avg ~ volunteer , data = data[data$survey ==1 & data$oe2_droppers == 0, ] )
#summary(reg12.f)

reg12.s <- lm(oe_avg ~ volunteer, data = data[data$survey ==0 & data$oe2_droppers == 0, ] )
#summary(reg12.s)

c2.reg12 <- lm(oe_avg ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + Education + survey, data[data$oe2_droppers == 0,] )
#summary(c2.reg12)

c2.reg12.f <- lm(oe_avg ~ volunteer +age.years + income +  factor(race) + liberal + factor(partyID) + Education, data = data[data$survey ==1 & data$oe2_droppers == 0, ] )
#summary(c2.reg12.f)

c2.reg12.s <- lm(oe_avg ~ volunteer + age.years + income +  factor(race) + liberal + factor(partyID) + services + prot + other_chr + other_non + Education , data = data[data$survey ==0 & data$oe2_droppers == 0, ] )
#summary(c2.reg12.s)

################################### 
##Open-ended quality and effort analysis
quality.out = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education + survey,
                 data = data[data$oe1_droppers == 0,])
#summary(quality.out)

quality.out.f = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education ,
                   data = data[data$oe1_droppers == 0 & data$survey ==1,])

quality.out.s = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID)  + services + prot + other_chr + other_non  + Education ,
                   data = data[data$oe1_droppers == 0 & data$survey ==0,])

effort.out = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education + survey,
                data = data[data$oe1_droppers == 0,])
#summary(effort.out)

effort.out.f = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education ,
                  data = data[data$oe1_droppers == 0 & data$survey ==1,])

effort.out.s = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID)  + services + prot + other_chr + other_non  + Education ,
                  data = data[data$oe1_droppers == 0 & data$survey ==0,])

#with controls

c.quality.out = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education + survey,
                   data = data[data$oe1_droppers == 0,])
#summary(c.quality.out)

c.quality.out.f = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education ,
                     data = data[data$oe1_droppers == 0 & data$survey ==1,])

c.quality.out.s = lm(Quality~volunteer+ age.years + income + factor(race) + liberal + factor(partyID)  + services + prot + other_chr + other_non  + Education ,
                     data = data[data$oe1_droppers == 0 & data$survey ==0,])

c.effort.out = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education + survey,
                  data = data[data$oe1_droppers == 0,])
#summary(c.effort.out)

c.effort.out.f = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID) + Education ,
                    data = data[data$oe1_droppers == 0 & data$survey ==1,])

c.effort.out.s = lm(Effort~volunteer+ age.years + income + factor(race) + liberal + factor(partyID)  + services + prot + other_chr + other_non  + Education ,
                    data = data[data$oe1_droppers == 0 & data$survey ==0,])

######################################################################
##CODE FOR COMBINED PLOT USING STANDARDIZED COEFFICIENTS

source("source_files/overall_plots.R")

#This code replaces combined values for reading and answering short items with FEP version only
betas_avgs[2:3, ] <- betas.f[3:4, ]

#overall plot for combined surveys:
coef.vec.1 <- betas_avgs$std.estimate
names.avg <- betas_avgs$names_avgs
error.vec.1 <- betas_avgs$std.error

#create data.frame for plotting
betas_df <- data.frame(term = names.avg,
                       estimate = coef.vec.1,
                       std.error = error.vec.1,
                       model = "Volunteer Coefficients",
                       stringsAsFactors = FALSE)

png(filename = "Fig2.png", width = 900, height = 600)
p <- dwplot(betas_df,
            dot_args = list(size = 3),
            whisker_args = list(lwd=1.25)
) +
  theme_bw() +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(legend.position="none") +
  scale_colour_grey(start = 0, end = .7) +
  labs(x ="Standardized Coefficient on Volunteer (higher values = higher quality)")+
  theme(axis.text = element_text(size = 14),
        legend.text=element_text(size=14),
        legend.title = element_text(size=0),
        axis.title = element_text(size = 14),
        text = element_text(size=14))+
  theme(axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)))

# Add brackets
p %>% add_brackets(list(c("Time Investment", "answering open-ended items", "answering short items"), 
                        c("Open-Ended Questions", "response effort", "response quality")),
                   face = "bold")

dev.off()

##########################
## Figure 2, Main paper ##
##########################

###########################
## S1 Appendix; Figure A ##
###########################

# read in dlabss geolocation data
# NOTE: The latitude and longitude data has been rounded to two decimal to maintain anonymity
# so the maps produces by this code will not be an exact replication of the maps in the S1 Appendix
geolocation <- read.csv("datasets/dlabss_geolocation_june_2018.csv")

# subset to united states volunteers
us_geo <- geolocation[geolocation$country_code=='US',]

# create data.frame for plotting by latitude and longitude
us.df <- data.frame(Longitude = us_geo$longitude, Latitude = us_geo$latitude)

# give coordinates of map and the type of map to be used for plotting
us_map <- c(-125, 25, -65, 50)
map<-get_map(location=us_map, zoom=5, maptype = "toner-lite",
             source='stamen')

# plot us map as in S1 Appendix
ggmap(map) + 
  geom_point(aes(x = Longitude, y = Latitude), data = us.df,
             alpha = .75, size = 1.25) + 
  scale_color_identity() +
  theme_bw() +
  theme(axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank())

###############################
## End S1 Appendix; Figure A ##
###############################

###########################
## S1 Appendix; Figure B ##
###########################

## now plot world map
# create data.frame for plotting with all geo data
world_dat <- data.frame(Longitude = geolocation$longitude, Latitude = geolocation$latitude)

# coordinates to be used for world map
world_map <- c(-179,-60,179,85)
# pull type of world map desired for plotting from google
map<-get_map(location=world_map, zoom=3, maptype = "terrain-background", color = 'bw',
             source='stamen')

# plot world map in S1 Appendix
ggmap(map) + ylim(-55,75) + xlim(-155,180) +
  geom_point(aes(x = Longitude, y = Latitude), data = world_dat,
             alpha = .75, size = 1) +
  ylab("") + xlab("")+
  theme(axis.title.x = element_text(size=14),
        axis.title.y = element_text(size=14),
        axis.text = element_text(size = 12),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        axis.ticks = element_blank())

###############################
## End S1 Appendix; Figure B ##
###############################

##########################
## S2 Appendix; Table A ##
##########################

# read in data for table
dl_compare <- read.csv("datasets/S2_Appendix_Table_A.csv")

## Column 1: DLABSS demographic data that is older than data presented in Table 1 in the main paper ##

dl_compare_means <- rep(NA, 9)
dl_compare_ses <- rep(NA, 9)

# DLABSS female  mean and sd x100 to be put on percent scale
dl_compare_means[1] <- round(mean(dl_compare$Gender==2, na.rm = T)*100,1)
dl_compare_ses[1] <- round(SE.prop(dl_compare$Gender==2)*100,1)

dl_compare_means[2] <- round(mean(dl_compare$age.years, na.rm = T),1)
dl_compare_ses[2] <- round(SE(dl_compare$age.years),1)

dl_compare_means[3] <- round(mean(dl_compare$Education, na.rm = T),1)
dl_compare_ses[3] <- round(SE(dl_compare$Education),1)

## all race x100 to be put on percent scale
## DLABSS race white
dl_compare_means[4] <- round(mean(dl_compare$Race=="white",na.rm=T)*100,1)
dl_compare_ses[4] <- round(SE.prop(dl_compare$Race=="white")*100,1)

## DLABSS race black
dl_compare_means[5] <- round(mean(dl_compare$Race=="black",na.rm=T)*100,1)
dl_compare_ses[5] <- round(SE.prop(dl_compare$Race=="black")*100,1)

# DLABSS self-reported party identification from master file
## create a party ID variable from numeric codes
dl_compare$partyID <- NA
dl_compare$partyID[dl_compare$Party == 1] <- "Democrat"
dl_compare$partyID[dl_compare$Party == 2] <- "Republican"
dl_compare$partyID[dl_compare$Party == 3] <- "Independent"

# all x100 to be put on percent scale
# democrat
dl_compare_means[6] <- round(mean(dl_compare$partyID=='Democrat',na.rm = T)*100,1)
dl_compare_ses[6] <- round(SE.prop(dl_compare$partyID=='Democrat')*100,1)

# independent/other
dl_compare_means[7] <- round(mean(dl_compare$partyID=='Independent',na.rm = T)*100,1)
dl_compare_ses[7] <- round(SE.prop(dl_compare$partyID=='Independent')*100,1)

# republican
dl_compare_means[8] <- round(mean(dl_compare$partyID=='Republican',na.rm = T)*100,1)
dl_compare_ses[8] <- round(SE.prop(dl_compare$partyID=='Republican')*100,1)

# put means and SEs together nicely for table column
dl_compare_msd <- paste(dl_compare_means," (",dl_compare_ses,")",sep="")

# add N row
dl_compare_msd[9] <- "807-6,280"

## Column 2: mturk demographic data from same survey as above, #######################
## therefore for table creation we just take the values we need to populate table A ##
## Be aware, mt_msd was created above for Table 1 
mt_compare_msd <- mt_msd[c(1,3,2,6,7,18,19,20,25)]

## Column 3: Kam et al. (2007) Student samples ##
## NOTE: These values are taken directly from Berinsky et al. (2012) and therefore hardcoded into table

kam_student_means <- rep(NA,9)
kam_student_ses <- rep(NA,9)

kam_student_means[1] <- 56.7
kam_student_ses[1] <- 1.3

kam_student_means[2] <- 20.3
kam_student_ses[2] <- 8.2

kam_student_means[4] <- 42.5
kam_student_ses[4] <- ""

# put means and SEs together nicely for table column
kam_student_msd <- paste(kam_student_means," (",kam_student_ses,")",sep="")

kam_student_msd[9] <- "277-1,428"

# make sure appropriate rows are blank for table
kam_student_msd[c(3,5:8)] <- ""

## Column 4: Kam et al. (2007) Adult samples ##
## NOTE: These values are taken directly from Berinsky et al. (2012) and therefore hardcoded into table

kam_adult_means <- rep(NA,9)
kam_adult_ses <- rep(NA,9)

kam_adult_means[1] <- 75.7
kam_adult_ses[1] <- 4.1

kam_adult_means[2] <- 45.5
kam_adult_ses[2] <- 0.916

kam_adult_means[3] <- 5.48
kam_adult_ses[3] <- 1.29

kam_adult_means[4] <- 82.2
kam_adult_ses[4] <- 3.7

# put means and SEs together nicely for table column
kam_adult_msd <- paste(kam_adult_means," (",kam_adult_ses,")",sep="")

kam_adult_msd[9] <- "109"

# make sure appropriate rows are blank for table
kam_adult_msd[c(5:8)] <- ""

## Column 5: Experiment 1 from Berinsky & Kinder (2006) ##
## NOTE: These values are taken directly from Berinsky et al. (2012) and therefore hardcoded into table

bk1_means <- rep(NA,9)

bk1_means[1] <- 66.0

bk1_means[2] <- 42.5

bk1_means[3] <- 15.1

bk1_means[4] <- 81.4

bk1_means[5] <- 12.9

bk1_means[6] <- 46.1

bk1_means[7] <- 37.6

bk1_means[8] <- 16.3

bk1_msd <- paste(bk1_means, sep="")

bk1_msd[9] <- "141"

## Column 5: Experiment 2 from Berinsky & Kinder (2006) ##
## NOTE: These values are taken directly from Berinsky et al. (2012) and therefore hardcoded into table

bk2_means <- rep(NA,9)

bk2_means[1] <- 57.1

bk2_means[2] <- 45.3

bk2_means[3] <- 14.9

bk2_means[4] <- 72.4

bk2_means[5] <- 22.7

bk2_means[6] <- 46.5

bk2_means[7] <- 27.7

bk2_means[8] <- 25.8

bk2_msd <- paste(bk2_means, sep="")

bk2_msd[9] <- "163"


# now combine all columns with means and SEs
all_msd_compare <- cbind(dl_compare_msd, mt_compare_msd, kam_student_msd, kam_adult_msd, bk1_msd, bk2_msd)

# label columns for table
cols <- c("DLABSS", "MTurk", "Student samples", "Adult sample", 
          "Experiment 1", "Experiment 2")

# label rows for table
rows <- c("Female", "Age (years)", "Education (years)", "White", "Black",
          "Democrat", "Independent/Other", "Republican", "N")

# combine all together into matrix for xtable
S2_TabA <- matrix(all_msd_compare,9,6,dimnames=list(rows,cols))

# use xtable to produce latex code for table
print(xtable(S2_TabA, align = c("l","c","c","c","c","c","c")), file = getOption("S2_TableA.file", "S2_TableA"))

###############################
## End S2 Appendix; Table A ###
###############################

##########################
## S2 Appendix; Table B ##
##########################

# read in required datasets
dlabss_anesp <- read.csv("datasets/dlabss_anesp_replication.csv")
mturk_anesp <- read.csv("datasets/mturk_anesp_replication.csv")
## load in ANES 2012 Time Series data, rename, and remove old version
load(file = "datasets/anes_2012.rda")
anes12 <- da35157.0001 ; rm(da35157.0001)
## first subset to face-to-face respondents only consistent with Berinsky et al. (2012)
anes12ftf <- anes12[anes12$MODE == "(1) 1. FTF (face-to-face) mode",]

## Column 1: DLABSS Political interest and knowledge ##

# empty vectors for means and SEs
dl_poli_means <- rep(NA,9)
dl_poli_ses <- rep(NA,9)

## DLABSS political interest of a scale of 1-5, 5 indicating highest interest
dl_poli_means[1] <- round(mean(dlabss_anesp$interest, na.rm = T),2)
dl_poli_ses[1] <- round(SE(dlabss_anesp$interest),2)

################## DLABSS political knowledge

# all political knowledge questions have been coded as 1=correct 0=incorrect

# presidential succession
dl_poli_means[2] <- round(mean(dlabss_anesp$knowledge5, na.rm = T)*100,1)
dl_poli_ses[2] <- round(SE.prop(dlabss_anesp$knowledge5)*100,1)

# vote to override veto
dl_poli_means[3] <- round(mean(dlabss_anesp$knowledge6, na.rm = T)*100,1)
dl_poli_ses[3] <- round(SE.prop(dlabss_anesp$knowledge6)*100,1)

# presidential term limit
dl_poli_means[4] <- round(mean(dlabss_anesp$knowledge1, na.rm = T)*100,1)
dl_poli_ses[4] <- round(SE.prop(dlabss_anesp$knowledge1)*100,1)

# length of US senate term
dl_poli_means[5] <- round(mean(dlabss_anesp$knowledge2, na.rm = T)*100,1)
dl_poli_ses[5] <- round(SE.prop(dlabss_anesp$knowledge2)*100,1)

# senators per state
dl_poli_means[6] <- round(mean(dlabss_anesp$knowledge3, na.rm = T)*100,1)
dl_poli_ses[6] <- round(SE.prop(dlabss_anesp$knowledge3)*100,1)

# length of house term
dl_poli_means[7] <- round(mean(dlabss_anesp$knowledge4, na.rm = T)*100,1)
dl_poli_ses[7] <- round(SE.prop(dlabss_anesp$knowledge4)*100,1)

# combine all answers for average percent correct
all_knowledge <- c(dlabss_anesp$knowledge1, dlabss_anesp$knowledge2,
                   dlabss_anesp$knowledge3, dlabss_anesp$knowledge4,
                   dlabss_anesp$knowledge5, dlabss_anesp$knowledge6)

# average percent correct
dl_poli_means[8] <- round(mean(all_knowledge, na.rm = T)*100,1)
dl_poli_ses[8] <- ""

# put means and SEs together nicely for table column
dl_poli_msd <- paste(dl_poli_means," (",dl_poli_ses,")",sep="")

# create N row
dl_poli_msd[9] <- "1,071-1,097"


## Column 2: mturk Political interest and knowledge ##

# empty vectors for means and SEs
mt_poli_means <- rep(NA,9)
mt_poli_ses <- rep(NA,9)

## mturk political interest of a scale of 1-5, 5 indicating highest interest
mt_poli_means[1] <- round(mean(mturk_anesp$interest, na.rm = T),2)
mt_poli_ses[1] <- round(SE(mturk_anesp$interest),2)

################## mturk political knowledge

# all political knowledge questions have been coded as 1=correct 0=incorrect

# presidential succession
mt_poli_means[2] <- round(mean(mturk_anesp$knowledge5, na.rm = T)*100,1)
mt_poli_ses[2] <- round(SE.prop(mturk_anesp$knowledge5)*100,1)

# vote to override veto
mt_poli_means[3] <- round(mean(mturk_anesp$knowledge6, na.rm = T)*100,1)
mt_poli_ses[3] <- round(SE.prop(mturk_anesp$knowledge6)*100,1)

# presidential term limit
mt_poli_means[4] <- round(mean(mturk_anesp$knowledge1, na.rm = T)*100,1)
mt_poli_ses[4] <- round(SE.prop(mturk_anesp$knowledge1)*100,1)

# length of US senate term
mt_poli_means[5] <- round(mean(mturk_anesp$knowledge2, na.rm = T)*100,1)
mt_poli_ses[5] <- round(SE.prop(mturk_anesp$knowledge2)*100,1)

# senators per state
mt_poli_means[6] <- round(mean(mturk_anesp$knowledge3, na.rm = T)*100,1)
mt_poli_ses[6] <- round(SE.prop(mturk_anesp$knowledge3)*100,1)

# length of house term
mt_poli_means[7] <- round(mean(mturk_anesp$knowledge4, na.rm = T)*100,1)
mt_poli_ses[7] <- round(SE.prop(mturk_anesp$knowledge4)*100,1)

# combine all answers for average percent correct
all_knowledge <- c(mturk_anesp$knowledge1, mturk_anesp$knowledge2,
                   mturk_anesp$knowledge3, mturk_anesp$knowledge4,
                   mturk_anesp$knowledge5, mturk_anesp$knowledge6)

# average percent correct
mt_poli_means[8] <- round(mean(all_knowledge, na.rm = T)*100,1)
mt_poli_ses[8] <- ""

# put means and SEs together nicely for table column
mt_poli_msd <- paste(mt_poli_means," (",mt_poli_ses,")",sep="")

# create N row
mt_poli_msd[9] <- "673-705"

## Column 3: ANESP Political interest and knowledge from Berinsky et al. (2012) ##

## NOTE: Values come from Berinsky et al. (2012) and are therefore hard coded into the table

# empty vectors for means and SEs
bhl_poli_means <- rep(NA,9)
bhl_poli_ses <- rep(NA,9)

bhl_poli_means[1] <- 2.71
bhl_poli_ses[1] <- 0.02

bhl_poli_means[2] <- 65.2
bhl_poli_ses[2] <- 2.0

bhl_poli_means[3] <- 73.6
bhl_poli_ses[3] <- 1.3

bhl_poli_means[4] <- 92.8
bhl_poli_ses[4] <- 0.7

bhl_poli_means[5] <- 37.5
bhl_poli_ses[5] <- 1.3

bhl_poli_means[6] <- 73.2
bhl_poli_ses[6] <- 1.2

bhl_poli_means[7] <- 38.9
bhl_poli_ses[7] <- 1.3

bhl_poli_means[8] <- 63.5
bhl_poli_ses[8] <- ""

# put means and SEs together nicely for table column
bhl_poli_msd <- paste(bhl_poli_means," (",bhl_poli_ses,")",sep="")

# create N row
# NOTE: The N here differs from the N reported in Berinsky et al. (2012) Table 4 for the ANESP
# because there is less missingness among the political knowledge and interest variables than the
# other variables reported in their table not presented here such as party ID, ideology, need for 
# cognition and need to evaluate
bhl_poli_msd[9] <- "2,727-3,003"

## Column 4: ANES 2012 Political interest and knowledge ##

# empty vectors for means and SEs
anes_poli_means <- rep(NA,9)
anes_poli_ses <- rep(NA,9)

## ANES 2012 political interest

# make numeric values
lbls <- sort(levels(anes12ftf$INTEREST_ATTENTION))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$INTEREST_ATTENTION <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$INTEREST_ATTENTION))
anes12ftf$INTEREST_ATTENTION <- add.value.labels(anes12ftf$INTEREST_ATTENTION, lbls)

## invert scale so 5 = very interested, 1 = not interested
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 1] <- 55
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 2] <- 44
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 4] <- 2
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 5] <- 1
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 44] <- 4
anes12ftf$INTEREST_ATTENTION[anes12ftf$INTEREST_ATTENTION == 55] <- 5

# find mean and SE
anes_poli_means[1] <- round(wtd.mean(anes12ftf$INTEREST_ATTENTION, weights = anes12ftf$WEIGHT_FTF),2)
anes_poli_ses[1] <- round(sqrt(wtd.var(anes12ftf$INTEREST_ATTENTION, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$INTEREST_ATTENTION))),2)

# put means and SEs together nicely for table column
anes_poli_msd <- paste(anes_poli_means," (",anes_poli_ses,")",sep="")

# create N row
anes_poli_msd[9] <- "2,004-2,054"

# make sure NAs are changed to blank for the table
anes_poli_msd[2:8] <- ""

# now combine all columns with means and SEs
all_msd_poli <- cbind(dl_poli_msd, mt_poli_msd, bhl_poli_msd, anes_poli_msd)

# label columns for table
cols <- c("DLABSS", "MTurk", "ANESP", "ANES 2012")

# label rows for table
rows <- c("Political Interest", "Presidential succession after VP", "House vote to override veto",
          "Number of terms can be president", "Length U.S. Senate term", "Number of Senators per state",
          "Length U.S. House term","Average", "N")

# combine all together into matrix for xtable
S2_TabB <- matrix(all_msd_poli,9,4,dimnames=list(rows,cols))

# use xtable to produce latex code for table
print(xtable(S2_TabB, align = c("l","c","c","c","c")), file = getOption("S2_TableB.file", "S2_TableB"))


###############################
## End S2 Appendix; Table B ###
###############################

###########################
## S2 Appendix; Table C ###
###########################

## Column 1: DLABSS policy attitudes ##

## All policy questions are coded as 1=favor 0=oppose

# empty vectors for means and SEs
dl_atts_means <- rep(NA,7)
dl_atts_ses <- rep(NA,7)

# drug benefits for seniors
dl_atts_means[1] <- round(mean(dlabss_anesp$opinion4, na.rm = T)*100,1)
dl_atts_ses[1] <- round(SE.prop(dlabss_anesp$opinion4)*100,1)

# universal health care
dl_atts_means[2] <- round(mean(dlabss_anesp$opinion5, na.rm = T)*100,1)
dl_atts_ses[2] <- round(SE.prop(dlabss_anesp$opinion5)*100,1)

# process for citizenship
dl_atts_means[3] <- round(mean(dlabss_anesp$opinion6, na.rm = T)*100,1)
dl_atts_ses[3] <- round(SE.prop(dlabss_anesp$opinion6)*100,1)

# amendment banning gay marriage
dl_atts_means[4] <- round(mean(dlabss_anesp$opinion1, na.rm = T)*100,1)
dl_atts_ses[4] <- round(SE.prop(dlabss_anesp$opinion1)*100,1)

# raising taxes on those making more than $200k
dl_atts_means[5] <- round(mean(dlabss_anesp$opinion2, na.rm = T)*100,1)
dl_atts_ses[5] <- round(SE.prop(dlabss_anesp$opinion2)*100,1)

# raising taxes on those making less than $200k
dl_atts_means[6] <- round(mean(dlabss_anesp$opinion3, na.rm = T)*100,1)
dl_atts_ses[6] <- round(SE.prop(dlabss_anesp$opinion3)*100,1)


# put means and SEs together nicely for table column
dl_atts_msd <- paste(dl_atts_means," (",dl_atts_ses,")",sep="")

# create N row
dl_atts_msd[7] <- "1,041-1,044"


## Column 2: mturk policy attitudes ##

## All policy questions are coded as 1=favor 0=oppose

# empty vectors for means and SEs
mt_atts_means <- rep(NA,7)
mt_atts_ses <- rep(NA,7)

# drug benefits for seniors
mt_atts_means[1] <- round(mean(mturk_anesp$opinion4, na.rm = T)*100,1)
mt_atts_ses[1] <- round(SE.prop(mturk_anesp$opinion4)*100,1)

# universal health care
mt_atts_means[2] <- round(mean(mturk_anesp$opinion5, na.rm = T)*100,1)
mt_atts_ses[2] <- round(SE.prop(mturk_anesp$opinion5)*100,1)

# process for citizenship
mt_atts_means[3] <- round(mean(mturk_anesp$opinion6, na.rm = T)*100,1)
mt_atts_ses[3] <- round(SE.prop(mturk_anesp$opinion6)*100,1)

# amendment banning gay marriage
mt_atts_means[4] <- round(mean(mturk_anesp$opinion1, na.rm = T)*100,1)
mt_atts_ses[4] <- round(SE.prop(mturk_anesp$opinion1)*100,1)

# raising taxes on those making more than $200k
mt_atts_means[5] <- round(mean(mturk_anesp$opinion2, na.rm = T)*100,1)
mt_atts_ses[5] <- round(SE.prop(mturk_anesp$opinion2)*100,1)

# raising taxes on those making less than $200k
mt_atts_means[6] <- round(mean(mturk_anesp$opinion3, na.rm = T)*100,1)
mt_atts_ses[6] <- round(SE.prop(mturk_anesp$opinion3)*100,1)


# put means and SEs together nicely for table column
mt_atts_msd <- paste(mt_atts_means," (",mt_atts_ses,")",sep="")

# create N row
mt_atts_msd[7] <- "703-705"

## Column 3: ANESP policy attitudes from Berinsky et al. (2012) ##

## NOTE: Values come from Berinsky et al. (2012) and are therefore hard coded into the table

# empty vectors for means and SEs
bhl_atts_means <- rep(NA,7)
bhl_atts_ses <- rep(NA,7)

bhl_atts_means[1] <- 74.8
bhl_atts_ses[1] <- 1.1

bhl_atts_means[2] <- 41.7
bhl_atts_ses[2] <- 1.2

bhl_atts_means[3] <- 42.7
bhl_atts_ses[3] <- 1.2

bhl_atts_means[4] <- 30.7
bhl_atts_ses[4] <- 1.2

bhl_atts_means[5] <- 55.4
bhl_atts_ses[5] <- 1.2

bhl_atts_means[6] <- 7.1
bhl_atts_ses[6] <- 0.6

# put means and SEs together nicely for table column
bhl_atts_msd <- paste(bhl_atts_means," (",bhl_atts_ses,")",sep="")

# create N row
bhl_atts_msd[7] <- "1,614-1,618"

## Column 4: ANES 2012 policy attitudes ##

## ANES 2012 policy attitudes

anes_atts_means <- rep(NA,7)
anes_atts_ses <- rep(NA,7)

## ANES citizenship process
lbls <- sort(levels(anes12ftf$IMMIG_CITIZEN))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$IMMIG_CITIZEN <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$IMMIG_CITIZEN))
anes12ftf$IMMIG_CITIZEN <- add.value.labels(anes12ftf$IMMIG_CITIZEN, lbls)

anes12ftf$IMMIG_CITIZEN[anes12ftf$IMMIG_CITIZEN == 3] <- 2

anes_atts_means[3] <- round(wtd.mean(anes12ftf$IMMIG_CITIZEN == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_atts_ses[3] <- round(sqrt(wtd.var(anes12ftf$IMMIG_CITIZEN == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$IMMIG_CITIZEN)))*100,1)

## ANES gay marriage
lbls <- sort(levels(anes12ftf$GAYRT_MARRY))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$GAYRT_MARRY <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$GAYRT_MARRY))
anes12ftf$GAYRT_MARRY <- add.value.labels(anes12ftf$GAYRT_MARRY, lbls)

anes_atts_means[4] <- round(wtd.mean(anes12ftf$GAYRT_MARRY != 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_atts_ses[4] <- round(sqrt(wtd.var(anes12ftf$GAYRT_MARRY != 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$GAYRT_MARRY)))*100,1)

## ANES tax on millionaries
lbls <- sort(levels(anes12ftf$MILLN_MILLTAX))
lbls <- (sub("^\\([0-9]+\\) +(.+$)", "\\1", lbls))
anes12ftf$MILLN_MILLTAX <- as.numeric(sub("^\\(0*([0-9]+)\\).+$", "\\1", anes12ftf$MILLN_MILLTAX))
anes12ftf$MILLN_MILLTAX <- add.value.labels(anes12ftf$MILLN_MILLTAX, lbls)

anes_atts_means[5] <- round(wtd.mean(anes12ftf$MILLN_MILLTAX == 1, weights = anes12ftf$WEIGHT_FTF)*100,1)
anes_atts_ses[5] <- round(sqrt(wtd.var(anes12ftf$MILLN_MILLTAX == 1, anes12ftf$WEIGHT_FTF))/sqrt(length(na.omit(anes12ftf$MILLN_MILLTAX)))*100,1)

# put means and SEs together nicely for table column
anes_atts_msd <- paste(anes_atts_means," (",anes_atts_ses,")",sep="")

# create N row
anes_atts_msd[7] <- "1,995-2,026"

# make sure NAs are blank for table presentation
anes_atts_msd[c(1:2,6)] <- ""

# add asterisks that correspond to table's footnote
anes_atts_msd[4] <- paste0(anes_atts_msd[4],"*")
anes_atts_msd[5] <- paste0(anes_atts_msd[5],"**")

# now combine all columns with means and SEs
all_msd_atts <- cbind(dl_atts_msd, mt_atts_msd, bhl_atts_msd, anes_atts_msd)

# label columns for table
cols <- c("DLABSS", "MTurk", "ANESP", "ANES 2012")

# label rows for table
rows <- c("Favor prescription drug benefits", "Favor universal health care", "Favor citizenship process", 
          "Favor amendment banning gay marriage", "Favor raising taxes more than \\$200k",
          "Favor raising taxes less than \\$200k","N")

# combine all together into matrix for xtable
S2_TabC <- matrix(all_msd_atts,7,4,dimnames=list(rows,cols))

# use xtable to produce latex code for table
print(xtable(S2_TabC, align = c("l","c","c","c","c")), file = getOption("S2_TableC.file", "S2_TableC"))

###############################
## End S2 Appendix; Table C ###
###############################

##########################
## S3 Appendix; Table A ##
##########################

# read required data for replications in Tables A and C
dlabss_replications <- read.csv("datasets/S3_Appendix_Table_A.csv")

##################################
## Rasinski 1989 (Welfare/Poor) ##
##################################

# convert characters to numeric
dlabss_replications$welfare <- as.numeric(as.character(dlabss_replications$welfare))
dlabss_replications$poor <- as.numeric(as.character(dlabss_replications$poor))

# rename vector that tells which condition was presented to participants
names(dlabss_replications)[which(names(dlabss_replications) == "RO.BL.Welfare")] <- "Rasinski1989"

# find number of people who support welfare
support.welfare <- table(dlabss_replications$welfare)[1] 

# get total people asked about welfare
total.welfare <- sum(table(dlabss_replications$welfare))

# find number of people who support assistance to poor
assist.poor <- table(dlabss_replications$poor)[1]

# get total people asked about assisting poor
total.poor <- sum(table(dlabss_replications$poor))

# get percent of people supporting each option (the denominator contains "I don't know" as well)
support.welfare.percent <- round(support.welfare/total.welfare*100,0)
assist.poor.percent <- round(assist.poor/total.poor*100,0)

# difference in percent
poor.welfare.diff <- assist.poor.percent - support.welfare.percent

# t-test to obtain p-value
t.test(dlabss_replications$welfare[dlabss_replications$Rasinski1989 == "welfare"], 
       dlabss_replications$poor[dlabss_replications$Rasinski1989 == "poor"])

#We also explored whether these treatment
#effects were heterogeneous. In both the 2010 GSS and the MTurk sample, the experimental effect does
#not differ reliably across gender, education, or racial lines. Furthermore, a test of the difference in the
#estimated treatment effect by demographic group crossed with differences in experimental administration
#(MTurk versus GSS) also yields null results

# create columns for table, with our results from above calculations and results from
# others hard coded into table
Poor <- c(assist.poor.percent, 64, 55)
Welfare <- c(support.welfare.percent, 23, 17)
Difference <- c(poor.welfare.diff, 37, 38)
p <- c("<.001", "<.001","<.001")
n <- c(788, 1470, 329)
Platforms <- c("DLABSS","General Social Surveys (GSS)", "MTurk")

# label columns for table
cols <- c("Platform", "Poor", "Welfare", "Difference", "p", "n")

# we want empty row names here
rows <- rep("",3)

# combine all together into matrix for xtable
S3_TabA <- matrix(cbind(Platforms,Poor,Welfare,Difference,p,n),3,6,dimnames = list(rows,cols))

# use xtable to produce latex code for table (we don't want row names here)
print(xtable(S3_TabA, align = c("l","l","c","c","c","c","c")), include.rownames = FALSE, file = getOption("S3_TableA.file", "S3_TableA"))


###############################
## End S3 Appendix; Table A ###
###############################

##########################
## S3 Appendix; Table B ##
##########################

# read required data for Table B
s3tableB <- read.csv("datasets/S3_Appendix_Table_B.csv")

#######################################################
## Tversky and Kahneman 1981 (Asian Disease Problem) ##
#######################################################

################################################
##Tversky and Kahneman 1981 (Asian Disease Problem)
################################################

# Die frame: 92/270 (34.1%) choose certainty policy
die.frame <- round(table(s3tableB$QID11)[2]/table(s3tableB$QID11)[1]*100,0)

##Save frame: 176/279 (63.1%) choose certainty policy
save.frame <- round(table(s3tableB$QID13)[2]/table(s3tableB$QID13)[1]*100,0)

save.die.diff <- save.frame - die.frame

# create variables for t-test and t-test to get p-value
s3tableB$ttest <- ifelse(s3tableB$QID11 == "Program A", 1,0)
s3tableB$ttest2 <- ifelse(s3tableB$QID13 == "Program A", 1,0)
t.test(s3tableB$ttest, s3tableB$ttest2)

# create columns for table, with our results from above calculations and results from
# others hard coded into table
Platform <- c("DLABSS", "MTurk (Berinsky et al. 2012)", "Tversky and Kahneman 1981")
LivesLost <- c(die.frame, 38, 22)
LivesSaved <- c(save.frame, 74, 72)
Difference <- c(save.die.diff, 36, 50)
p <- c("<.001", "<.001","<.001")
n <- c(539, 450, 307)

# label columns for table
cols <- c("Platform", "Lives Saved", "Lives Lost", "Difference", "p", "n")

# we want empty row names here
rows <- rep("",3)

# combine all together into matrix for xtable
S3_TabB <- matrix(cbind(Platform,LivesSaved,LivesLost,Difference,p,n),3,6,dimnames = list(rows,cols))

# use xtable to produce latex code for table (we don't want row names here)
print(xtable(S3_TabB, align = c("l","l","c","c","c","c","c")), include.rownames = FALSE, file = getOption("S3_TableB.file", "S3_TableB"))


###############################
## End S3 Appendix; Table B ###
###############################

###########################################
## S3 Appendix; Table C:  DLABSS Portion ##
###########################################

# read required data for Table C
s3tableC <- read.csv("datasets/S3_Appendix_Table_C.csv")

#####################################################################################
# Kam and Simas (2010): individuals with higher levels of risk acceptance are 
# more likely to prefer probabilistic outcomes
#####################################################################################

# select respondents who received Kam and Simas (2010) prompt
s3tableC <- subset(s3tableC, is.na(s3tableC$asian) == TRUE) 

# sum across rows to begin building risk acceptance variable
s3tableC$intro1 <- rowSums(s3tableC[,c("intro1s_1", "intro1d_1")], na.rm=T) 
s3tableC$intro2 <- rowSums(s3tableC[,c("intro2s", "intro2d")], na.rm=T) 
s3tableC$intro3 <- rowSums(s3tableC[,c("intro3s", "intro3d")], na.rm=T) 
s3tableC$intro4 <- rowSums(s3tableC[,c("intro4s", "intro4d")], na.rm=T) 
s3tableC$intro5 <- rowSums(s3tableC[,c("intro5s", "intro5d")], na.rm=T) 
s3tableC$intro6 <- rowSums(s3tableC[,c("intro6s", "intro6d")], na.rm=T) 
s3tableC$intro7 <- rowSums(s3tableC[,c("intro7s", "intro7d")], na.rm=T) 

# drop missing observations
s3tableC <- s3tableC[s3tableC$intro1 != 0,]

# remove people with missing relevant demographic data
s3tableC <- s3tableC[!is.na(s3tableC$Salary),]
s3tableC <- s3tableC[!is.na(s3tableC$age.years),]
s3tableC <- s3tableC[!is.na(s3tableC$Education),]

## see pp. 386 of Kam and Simas 2010 for instructions on how to build risk acceptance variable
## intro2 and intro7 need to be inverted (to ensure higher value = more risk acceptance)
s3tableC$intro2 <- 6-s3tableC$intro2 
s3tableC$intro7 <- 5-s3tableC$intro7
s3tableC$intro1 <- (s3tableC$intro1 - min(s3tableC$intro1))/(max(s3tableC$intro1) - min(s3tableC$intro1))
s3tableC$intro2 <- (s3tableC$intro2 - min(s3tableC$intro2))/(max(s3tableC$intro2) - min(s3tableC$intro2))
s3tableC$intro3 <- (s3tableC$intro3 - min(s3tableC$intro3))/(max(s3tableC$intro3) - min(s3tableC$intro3))
s3tableC$intro4 <- (s3tableC$intro4 - min(s3tableC$intro4))/(max(s3tableC$intro4) - min(s3tableC$intro4))
s3tableC$intro5 <- (s3tableC$intro5 - min(s3tableC$intro5))/(max(s3tableC$intro5) - min(s3tableC$intro5))
s3tableC$intro6 <- (s3tableC$intro6 - min(s3tableC$intro6))/(max(s3tableC$intro6) - min(s3tableC$intro6))
s3tableC$intro7 <- (s3tableC$intro7 - min(s3tableC$intro7))/(max(s3tableC$intro7) - min(s3tableC$intro7))
s3tableC$RA <- rowSums(s3tableC[,c("intro1", "intro2", "intro3", "intro4", 
                                   "intro5", "intro6", "intro7")], na.rm=T) 
s3tableC$RA <- (s3tableC$RA - min(s3tableC$RA))/(max(s3tableC$RA) - min(s3tableC$RA))

# For risk acceptance, DLABSS mean is .51, Kam and Simas is .45
mean(s3tableC$RA)


## normalize the quantitative variables for regressions
s3tableC$Salary <- (s3tableC$Salary - min(s3tableC$Salary)) / (max(s3tableC$Salary) - min(s3tableC$Salary))
s3tableC$Education <- (s3tableC$Education - min(s3tableC$Education)) / (max(s3tableC$Education) - min(s3tableC$Education))
s3tableC$age.years <- (s3tableC$age.years - min(s3tableC$age.years)) / (max(s3tableC$age.years) - min(s3tableC$age.years))

# create dummy variable for mortality frame
s3tableC$dieframe <- ifelse(is.na(s3tableC$die1) == FALSE, 1,0)

# create outcome variable
s3tableC$prefprob <- rowSums(s3tableC[,c("die1", "save1")], na.rm=T) 
s3tableC$prefprob <- ifelse(s3tableC$prefprob == 2, 1,0)

# binary indicators for female and ideology
s3tableC$Female <- ifelse(s3tableC$Gender == 2, 1,0)
s3tableC$Ideology <- ifelse(s3tableC$Ideology == 2, 1,0)

# H1a probit regression
H1a <- glm(prefprob ~ dieframe + RA, family=binomial(link="probit"), 
           data=s3tableC)

# H1b probit regression
H1b <- glm(prefprob ~ dieframe + RA + Female + age.years +
             Education + Salary + Ideology, family=binomial(link="probit"), 
           data=s3tableC)

# H2 probit regression
H2 <- glm(prefprob ~ dieframe + RA + RA*dieframe, family=binomial(link="probit"), 
          data=s3tableC)

# combine regressions into stargazer for latex output
capture.output(stargazer(H1a, H1b, H2,
          dep.var.labels.include = FALSE,
          model.numbers          = FALSE,
          dep.var.caption = "DLABSS",
          omit.stat = c("ll","aic"),
          star.char = c("", "", ""),
          column.labels = c("H1a", "H1b", "H2"),
          covariate.labels = c("Mortality frame in Trial 1", "Risk Acceptance", "Female",
                               "Age", "Education", "Income", "Ideology", 
                               "Risk Acceptance x Mortality frame"),
          notes = NULL, notes.label = NULL, out = "S3_TableC"))

###############################################
## End S3 Appendix; Table C:  DLABSS Portion ##
###############################################

###############################################################
## S3 Appendix; Figure A: Replication of Hainmueller and Hiscox
###############################################################

# load required dataset
d <- read.csv("datasets/immigration_dlabss_replication.csv")
m <- read.csv("datasets/immigration_mturk_replication.csv")

# drop missinfg cases
d <- d[complete.cases(d[17]),]

# assign condition
d$Randomizer <- as.character(d$Randomizer)
d$Randomizer[d$Randomizer == "Q1"] <- 1
d$Randomizer[d$Randomizer == "Q2"] <- 2
d$Randomizer <- as.numeric(d$Randomizer, na.rm = TRUE)

################################################
##Recreate first two plots from Hainmueller and Hiscox 2010
################################################

################################################
##For high-skilled frame
################################################
dh <- subset(d, d$Randomizer == 1)
d.p <-matrix(NA, nrow = nrow(dh), ncol = 5)
d.p <- as.data.frame(d.p)
d.p[1] <- ifelse(dh$High == 1, 1, 0)
d.p[2] <- ifelse(dh$High == 2, 1, 0)
d.p[3] <- ifelse(dh$High == 3, 1, 0)
d.p[4] <- ifelse(dh$High == 4, 1, 0)
d.p[5] <- ifelse(dh$High == 5, 1, 0)
d.p <- na.omit(d.p)
m.h <- apply(d.p, 2, mean)

################################################
##For low-skilled frame
################################################
dl <- subset(d, d$Randomizer == 2)
d.pl <-matrix(NA, nrow = nrow(dl), ncol = 5)
d.pl <- as.data.frame(d.pl)
d.pl[1] <- ifelse(dl$Low == 1, 1, 0)
d.pl[2] <- ifelse(dl$Low == 2, 1, 0)
d.pl[3] <- ifelse(dl$Low == 3, 1, 0)
d.pl[4] <- ifelse(dl$Low == 4, 1, 0)
d.pl[5] <- ifelse(dl$Low == 5, 1, 0)
d.pl <- na.omit(d.pl)
m.l <- apply(d.pl, 2, mean)

################################################
##Function for 95% CI for upper bound (Not Taylor Series)
##These CI's are for population proportion/percentage
################################################
ci.func <- function(m, n) {
  m + 1.96 * sqrt((m*(1-m)/n))
}
high.ci <- ci.func(m.h, nrow(dh)) 
low.ci <- ci.func(m.l, nrow(dl)) 

################################################
##Hainmueller and Hiscox Figure 2
################################################

################################################
##top half: highly skilled
################################################
#var <- c("Strongly disagree", "Disagree", "Neutral", "Agree", "Strongly agree")
var <- c(1:5)
f2.h <- cbind(var,m.h, high.ci)
f2.h <- as.data.frame(f2.h)

dodge <- position_dodge(width = 0.9)
limits <- aes(ymax = high.ci,
              ymin = m.h)

p <- ggplot(data = f2.h, aes(x = var, y = m.h, fill = factor(var))) +
  geom_bar(stat = "identity", position = dodge) +
  geom_errorbar(limits, position = dodge, width=.2) +
  coord_flip() +
  labs(y = NULL, x = "Highly skilled",  
       color = "Legend Title\n") +
  scale_x_continuous(breaks=NULL) +
  scale_fill_grey(name="",
                  breaks=c("5", "4", "3", "2", "1"),
                  labels=c("Strongly Agree", "Agree", "Neither Agree nor Disagree", "Disagree", "Strongly Disagree"),
                  start=0.8, end=0) +
  theme_bw() +
  theme(plot.title = element_text(size=28),
        axis.title=element_text(size=20,face="bold"),
        axis.text=element_text(size=20),
        legend.text=element_text(size=26)) +
  ylim(0, .4) +
  theme(legend.position="none") +
  labs(title = "DLABSS Results")

################################################
##bottom half: low-skilled
################################################  
f2.l <- cbind(var,m.l, low.ci)
f2.l <- as.data.frame(f2.l)
limits.2 <- aes(ymax = low.ci,
                ymin = m.l)
f2.l$var <- as.factor(f2.l$var)

p.2 <- ggplot(data = f2.l, aes(x = var, y = m.l, fill = factor(var))) +
  geom_bar(stat = "identity", position = dodge) +
  geom_errorbar(limits.2, position = dodge, width=.2) +
  coord_flip() +
  labs(y = NULL, x = "Low skilled",  
       color = "Legend Title\n") +
  scale_x_discrete(breaks=NULL) +
  scale_fill_grey(name="",
                  breaks=c("5", "4", "3", "2", "1"),
                  labels=c("Strongly Agree", "Agree", "Neither Agree nor Disagree", "Disagree", "Strongly Disagree"),
                  start=0.8, end=0) +
  theme_bw() +
  theme(plot.title = element_text(size=28),
        axis.title=element_text(size=20,face="bold"),
        axis.text=element_text(size=20),
        legend.text=element_text(size=26)) +
  ylim(0, .4) +
  theme(legend.position="none") 


multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  
  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots/cols)),
                     ncol = cols, nrow = ceiling(numPlots/cols))}
  if (numPlots==1) {
    print(plots[[1]])} else {
      # Set up the page
      grid.newpage()
      pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
      
      # Make each plot, in the correct location
      for (i in 1:numPlots) {
        # Get the i,j matrix positions of the regions that contain this subplot
        matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
        
        print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
                                        layout.pos.col = matchidx$col))
      }
    }
}
multiplot(p, p.2, cols=1)

####################################
# MTURK Portion
####################################

# drop missing
m$Q1 <- as.character(m$Q1)
mh <- subset(m, m$Q1 != "") ##n = 429
ml <- subset(m, m$Q2.1 != "") ##n = 402

##################
## Highly-Skilled
##################
m.p <-matrix(NA, nrow = nrow(mh), ncol = 5)
m.p <- as.data.frame(m.p)
mh$Q1 <- as.character(mh$Q1)
m.p[1] <- ifelse(mh$Q1 == "Strongly disagree", 1, 0)
m.p[2] <- ifelse(mh$Q1 == "Somewhat disagree", 1, 0)
m.p[3] <- ifelse(mh$Q1 == "Neither agree nor disagree", 1, 0)
m.p[4] <- ifelse(mh$Q1 == "Somewhat agree", 1, 0)
m.p[5] <- ifelse(mh$Q1 == "Strongly agree", 1, 0)
m.p <- na.omit(m.p)
m.h.m <- apply(m.p, 2, mean)

##################
## Highly-Skilled
##################
m.p.l <-matrix(NA, nrow = nrow(ml), ncol = 5)
m.p.l <- as.data.frame(m.p.l)
ml$Q2.1 <- as.character(ml$Q2.1)
m.p.l[1] <- ifelse(ml$Q2.1 == "Strongly disagree", 1, 0)
m.p.l[2] <- ifelse(ml$Q2.1 == "Somewhat disagree", 1, 0)
m.p.l[3] <- ifelse(ml$Q2.1 == "Neither agree nor disagree", 1, 0)
m.p.l[4] <- ifelse(ml$Q2.1 == "Somewhat agree", 1, 0)
m.p.l[5] <- ifelse(ml$Q2.1 == "Strongly agree", 1, 0)
m.p.l <- na.omit(m.p.l)
m.l.m <- apply(m.p.l, 2, mean)

# confidence intervals using function from above
high.ci <- ci.func(m.h.m, nrow(mh)) 
low.ci <- ci.func(m.l.m, nrow(ml)) 

var <- c(1:5)
f2.h <- cbind(var,m.h.m, high.ci)
f2.h <- as.data.frame(f2.h)
f2.l <- cbind(var,m.l.m, low.ci)
f2.l <- as.data.frame(f2.l)

dodge <- position_dodge(width = 0.9)
limits <- aes(ymax = high.ci,
              ymin = m.h.m)
limits.l <- aes(ymax = low.ci,
                ymin = m.l.m)

p.m <- ggplot(data = f2.h, aes(x = var, y = m.h.m, fill = factor(var))) +
  geom_bar(stat = "identity", position = dodge) +
  geom_errorbar(limits, position = dodge, width=.2) +
  coord_flip() +
  labs(y = NULL, x = "Highly skilled",  
       color = "Legend Title\n") +
  scale_x_continuous(breaks=NULL) +
  scale_fill_grey(name="",
                  breaks=c("5", "4", "3", "2", "1"),
                  labels=c("Strongly Agree", "Agree", "Neither Agree nor Disagree", "Disagree", "Strongly Disagree"),
                  start=0.8, end=0) +
  theme_bw() +
  theme(plot.title = element_text(size=28),
        axis.title=element_text(size=20,face="bold"),
        axis.text=element_text(size=20),
        legend.text=element_text(size=26)) +
  ylim(0, .4) +
  theme(legend.position="none")  + 
  labs(title = "MTurk Results")



p.l <- ggplot(data = f2.l, aes(x = var, y = m.l.m, fill = factor(var))) +
  geom_bar(stat = "identity", position = dodge) +
  geom_errorbar(limits.l, position = dodge, width=.2) +
  coord_flip() +
  labs(y = NULL, x = "Low skilled",  
       color = "Legend Title\n") +
  scale_x_continuous(breaks=NULL) +
  scale_fill_grey(name="",
                  breaks=c("5", "4", "3", "2", "1"),
                  labels=c("Strongly Agree", "Agree", "Neither Agree nor Disagree", "Disagree", "Strongly Disagree"),
                  start=0.8, end=0) +
  theme_bw() +
  theme(plot.title = element_text(size=28),
        axis.title=element_text(size=20,face="bold"),
        axis.text=element_text(size=20),
        legend.text=element_text(size=26)) +
  ylim(0, .4) +
  theme(legend.position="bottom")

# combine plots for both DLABSS and MTurk into one and plot
plot_grid(p, p.2, p.m, p.l, nrow = 4)

######################################################################
## End S3 Appendix; Figure A: Replication of Hainmueller and Hiscox ##
######################################################################

###########################################
## S3 Appendix; Table D: DLABSS Portion ##
##########################################

################################################
##Tomz 2007
################################################

# read required data
tomz <- read.csv("datasets/Tomz_2007_dlabss_replication.csv")

## create k levels of approval as dummy variable used for theta values in
## dirichlet distribution

## strongly approve
tomz$strong.app <- 0
tomz$strong.app[tomz$Q5a == 1 & tomz$Q5b == 1] <- 1

## somewhat approve
tomz$somewhat.app <- 0
tomz$somewhat.app[tomz$Q5a == 1 & tomz$Q5b == 2] <- 1

## strongly disapprove
tomz$strong.dis <- 0
tomz$strong.dis[tomz$Q5a == 2 & tomz$Q5c == 1] <- 1

## somewhat disapprove
tomz$somewhat.dis <- 0
tomz$somewhat.dis[tomz$Q5a == 2 & tomz$Q5c == 2] <- 1

## lean approve
tomz$lean.app <- 0
tomz$lean.app[tomz$Q5a == 3 & tomz$Q5d == 1] <- 1

## lean disapprove
tomz$lean.dis <- 0
tomz$lean.dis[tomz$Q5a == 3 & tomz$Q5d == 2] <- 1

## don't lean
tomz$no.lean <- 0
tomz$no.lean[tomz$Q5a == 3 & tomz$Q5d == 3] <- 1

## subset data for stay out and empty threat scenarios so probability of falling 
## into each of the separate approval categories can be calculated separately 
## for each scenario and be compared
StayOut <- subset(tomz, tomz$P1 == 1)
EmptyThreat <- subset(tomz, tomz$P2 == 1)

## calculate theta for each approval category to be used as dirichlet parameters
SO.theta <- colSums(StayOut[,c("strong.app","somewhat.app","strong.dis","somewhat.dis",
                               "lean.app","lean.dis","no.lean")])
ET.theta <- colSums(EmptyThreat[,c("strong.app","somewhat.app","strong.dis","somewhat.dis",
                                   "lean.app","lean.dis","no.lean")])

## alpha parameter of dirichlet distribution
alpha <- 0.5

## simulate from dirichlet distribution to get the posterior probabilities for each approval level
## for each of the different scenarios
set.seed(2138)
SO.posterior <- rdirichlet(10000, SO.theta+alpha)
ET.posterior <- rdirichlet(10000, ET.theta+alpha)

# empty vectors for strongly disapprove means and credible intervals
dis.strong <- rep(NA, 4)
dis.strong.lci <- rep(NA, 4)
dis.strong.uci <- rep(NA, 4)

## strongly disapprove posterior mean and CI empty threat
dis.strong[1] <- round(mean(ET.posterior[,3])*100,0)
dis.strong.lci[1] <- round(quantile(ET.posterior[,3], c(0.025,0.975))*100,0)[1]
dis.strong.uci[1] <- round(quantile(ET.posterior[,3], c(0.025,0.975))*100,0)[2]

## strongly disapprove posterior mean and CI stay out
dis.strong[2] <- round(mean(SO.posterior[,3])*100,0)
dis.strong.lci[2] <- round(quantile(SO.posterior[,3], c(0.025,0.975))*100,0)[1]
dis.strong.uci[2] <- round(quantile(SO.posterior[,3], c(0.025,0.975))*100,0)[2]

## strongly disapprove posterior mean and CI difference
dis.strong[3] <- round(mean(ET.posterior[,3]-SO.posterior[,3])*100,0)
dis.strong.lci[3] <- round(quantile(ET.posterior[,3]-SO.posterior[,3], c(0.025,0.975))*100,0)[1]
dis.strong.uci[3] <- round(quantile(ET.posterior[,3]-SO.posterior[,3], c(0.025,0.975))*100,0)[2]

# create complete character string for table
dis.strong.CI <- paste("(", dis.strong.lci," to ",dis.strong.uci, ")", sep="")

# blank spots here because the summary of differences will go below here
dis.strong[4] <- ""
dis.strong.CI[4] <- ""

# empty vectors for somewhat disapprove means and credible intervals
dis.somewhat <- rep(NA, 4)
dis.somewhat.lci <- rep(NA, 4)
dis.somewhat.uci <- rep(NA, 4)

## somewhat disapprove posterior mean and CI empty threat
dis.somewhat[1] <- round(mean(ET.posterior[,4])*100,0)
dis.somewhat.lci[1] <- round(quantile(ET.posterior[,4], c(0.025,0.975))*100,0)[1]
dis.somewhat.uci[1] <- round(quantile(ET.posterior[,4], c(0.025,0.975))*100,0)[2]

## somewhat disapprove posterior mean and CI stay out
dis.somewhat[2] <- round(mean(SO.posterior[,4])*100,0)
dis.somewhat.lci[2] <- round(quantile(SO.posterior[,4], c(0.025,0.975))*100,0)[1]
dis.somewhat.uci[2] <- round(quantile(SO.posterior[,4], c(0.025,0.975))*100,0)[2]

## somewhat disapprove posterior mean and CI difference
dis.somewhat[3] <- round(mean(ET.posterior[,4]-SO.posterior[,4])*100,0)
dis.somewhat.lci[3] <- round(quantile(ET.posterior[,4]-SO.posterior[,4], c(0.025,0.975))*100,0)[1]
dis.somewhat.uci[3] <- round(quantile(ET.posterior[,4]-SO.posterior[,4], c(0.025,0.975))*100,0)[2]

## summary of disapprove differences
dis.somewhat[4] <- round(mean(ET.posterior[,3:4]-SO.posterior[,3:4])*100,0)
dis.somewhat.lci[4] <- round(quantile(ET.posterior[,3:4]-SO.posterior[,3:4], c(0.025,0.975))*100,0)[1]
dis.somewhat.uci[4] <- round(quantile(ET.posterior[,3:4]-SO.posterior[,3:4], c(0.025,0.975))*100,0)[2]

# create complete character string for table
dis.somewhat.CI <- paste("(", dis.somewhat.lci," to ",dis.somewhat.uci, ")", sep="")

# empty vectors for somewhat disapprove means and credible intervals
lean.dis <- rep(NA, 4)
lean.dis.lci <- rep(NA, 4)
lean.dis.uci <- rep(NA, 4)

## lean disapprove posterior mean and CI empty threat
lean.dis[1] <- round(mean(ET.posterior[,6])*100,0)
lean.dis.lci[1] <- round(quantile(ET.posterior[,6], c(0.025,0.975))*100,0)[1]
lean.dis.uci[1] <- round(quantile(ET.posterior[,6], c(0.025,0.975))*100,0)[2]

## lean disapprove posterior mean and CI stay out
lean.dis[2] <- round(mean(SO.posterior[,6])*100,0)
lean.dis.lci[2] <- round(quantile(SO.posterior[,6], c(0.025,0.975))*100,0)[1]
lean.dis.uci[2] <- round(quantile(SO.posterior[,6], c(0.025,0.975))*100,0)[2]

## lean disapprove posterior mean and CI difference
lean.dis[3] <- round(mean(ET.posterior[,6]-SO.posterior[,6])*100,0)
lean.dis.lci[3] <- round(quantile(ET.posterior[,6]-SO.posterior[,6], c(0.025,0.975))*100,0)[1]
lean.dis.uci[3] <- round(quantile(ET.posterior[,6]-SO.posterior[,6], c(0.025,0.975))*100,0)[2]

# create complete character string for table
lean.dis.CI <- paste("(", lean.dis.lci," to ",lean.dis.uci, ")", sep="")

# nothing in this column, summary of differences in row below
lean.dis[4] <- ""
lean.dis.CI[4] <- ""

# empty vectors for not leaning either way means and credible intervals
no.lean <- rep(NA, 4)
no.lean.lci <- rep(NA, 4)
no.lean.uci <- rep(NA, 4)

## not leaning either way posterior mean and CI empty threat
no.lean[1] <- round(mean(ET.posterior[,7])*100,0)
no.lean.lci[1] <- round(quantile(ET.posterior[,7], c(0.025,0.975))*100,0)[1]
no.lean.uci[1] <- round(quantile(ET.posterior[,7], c(0.025,0.975))*100,0)[2]

## not leaning either way posterior mean and CI stay out
no.lean[2] <- round(mean(SO.posterior[,7])*100,0)
no.lean.lci[2] <- round(quantile(SO.posterior[,7], c(0.025,0.975))*100,0)[1]
no.lean.uci[2] <- round(quantile(SO.posterior[,7], c(0.025,0.975))*100,0)[2]

## not leaning either way posterior mean and CI difference
no.lean[3] <- round(mean(ET.posterior[,7]-SO.posterior[,7])*100,0)
no.lean.lci[3] <- round(quantile(ET.posterior[,7]-SO.posterior[,7], c(0.025,0.975))*100,0)[1]
no.lean.uci[3] <- round(quantile(ET.posterior[,7]-SO.posterior[,7], c(0.025,0.975))*100,0)[2]

## summary of neither differences
no.lean[4] <- round(mean(ET.posterior[,5:7]-SO.posterior[,5:7])*100,0)
no.lean.lci[4] <- round(quantile(ET.posterior[,5:7]-SO.posterior[,5:7], c(0.025,0.975))*100,0)[1]
no.lean.uci[4] <- round(quantile(ET.posterior[,5:7]-SO.posterior[,5:7], c(0.025,0.975))*100,0)[2]

# create complete character string for table
no.lean.CI <- paste("(", no.lean.lci," to ", no.lean.uci, ")", sep="")

# empty vectors for somewhat approve means and credible intervals
lean.app <- rep(NA, 4)
lean.app.lci <- rep(NA, 4)
lean.app.uci <- rep(NA, 4)

## lean approve posterior mean and CI empty threat
lean.app[1] <- round(mean(ET.posterior[,5])*100,0)
lean.app.lci[1] <- round(quantile(ET.posterior[,5], c(0.025,0.975))*100,0)[1]
lean.app.uci[1] <- round(quantile(ET.posterior[,5], c(0.025,0.975))*100,0)[2]

## lean approve posterior mean and CI stay out
lean.app[2] <- round(mean(SO.posterior[,5])*100,0)
lean.app.lci[2] <- round(quantile(SO.posterior[,5], c(0.025,0.975))*100,0)[1]
lean.app.uci[2] <- round(quantile(SO.posterior[,5], c(0.025,0.975))*100,0)[2]

## lean approve posterior mean and CI difference
lean.app[3] <- round(mean(ET.posterior[,5]-SO.posterior[,5])*100,0)
lean.app.lci[3] <- round(quantile(ET.posterior[,5]-SO.posterior[,5], c(0.025,0.975))*100,0)[1]
lean.app.uci[3] <- round(quantile(ET.posterior[,5]-SO.posterior[,5], c(0.025,0.975))*100,0)[2]

# create complete character string for table
lean.app.CI <- paste("(", lean.app.lci," to ",lean.app.uci, ")", sep="")

# nothing in this column, summary of differences in row below
lean.app[4] <- ""
lean.app.CI[4] <- ""

# empty vectors for somewhat approve means and credible intervals
app.somewhat <- rep(NA, 4)
app.somewhat.lci <- rep(NA, 4)
app.somewhat.uci <- rep(NA, 4)

## somewhat approve posterior mean and CI empty threat
app.somewhat[1] <- round(mean(ET.posterior[,2])*100,0)
app.somewhat.lci[1] <- round(quantile(ET.posterior[,2], c(0.025,0.975))*100,0)[1]
app.somewhat.uci[1] <- round(quantile(ET.posterior[,2], c(0.025,0.975))*100,0)[2]

## somewhat approve posterior mean and CI stay out
app.somewhat[2] <- round(mean(SO.posterior[,2])*100,0)
app.somewhat.lci[2] <- round(quantile(SO.posterior[,2], c(0.025,0.975))*100,0)[1]
app.somewhat.uci[2] <- round(quantile(SO.posterior[,2], c(0.025,0.975))*100,0)[2]

## somewhat approve posterior mean and CI difference
app.somewhat[3] <- round(mean(ET.posterior[,2]-SO.posterior[,2])*100,0)
app.somewhat.lci[3] <- round(quantile(ET.posterior[,2]-SO.posterior[,2], c(0.025,0.975))*100,0)[1]
app.somewhat.uci[3] <- round(quantile(ET.posterior[,2]-SO.posterior[,2], c(0.025,0.975))*100,0)[2]

# create complete character string for table
app.somewhat.CI <- paste("(", app.somewhat.lci," to ",app.somewhat.uci, ")", sep="")

# nothing in this column, summary of differences in row below
app.somewhat[4] <- ""
app.somewhat.CI[4] <- ""

# empty vectors for strongly approve means and credible intervals
app.strong <- rep(NA, 4)
app.strong.lci <- rep(NA, 4)
app.strong.uci <- rep(NA, 4)

## strongly approve posterior mean and CI empty threat
app.strong[1] <- round(mean(ET.posterior[,1])*100,0)
app.strong.lci[1] <- round(quantile(ET.posterior[,1], c(0.025,0.975))*100,0)[1]
app.strong.uci[1] <- round(quantile(ET.posterior[,1], c(0.025,0.975))*100,0)[2]

## strongly approve posterior mean and CI stay out
app.strong[2] <- round(mean(SO.posterior[,1])*100,0)
app.strong.lci[2] <- round(quantile(SO.posterior[,1], c(0.025,0.975))*100,0)[1]
app.strong.uci[2] <- round(quantile(SO.posterior[,1], c(0.025,0.975))*100,0)[2]

## strongly approve posterior mean and CI difference
app.strong[3] <- round(mean(ET.posterior[,1]-SO.posterior[,1])*100,0)
app.strong.lci[3] <- round(quantile(ET.posterior[,1]-SO.posterior[,1], c(0.025,0.975))*100,0)[1]
app.strong.uci[3] <- round(quantile(ET.posterior[,1]-SO.posterior[,1], c(0.025,0.975))*100,0)[2]

## summary of approve differences
app.strong[4] <- round(mean(ET.posterior[,1:2]-SO.posterior[,1:2])*100,0)
app.strong.lci[4] <- round(quantile(ET.posterior[,1:2]-SO.posterior[,1:2], c(0.025,0.975))*100,0)[1]
app.strong.uci[4] <- round(quantile(ET.posterior[,1:2]-SO.posterior[,1:2], c(0.025,0.975))*100,0)[2]

# create complete character string for table
app.strong.CI <- paste("(", app.strong.lci," to ",app.strong.uci, ")", sep="")

# rbind each row together, alternating point estimates and credible intervals for those estimates
tomz_table_replication <- rbind(dis.strong, dis.strong.CI,
                                dis.somewhat, dis.somewhat.CI,
                                lean.dis, lean.dis.CI,
                                no.lean, no.lean.CI,
                                lean.app, lean.app.CI,
                                app.somewhat, app.somewhat.CI,
                                app.strong, app.strong.CI)

# create row names to be used in table
tomz_row_names <- c("Disapprove very strongly", "", 
                    "Disapprove somewhat", "",
                    "Lean toward disapproving", "",
                    "Don't lean either way", "",
                    "Lean toward approving", "",
                    "Approve somewhat", "",
                    "Approve very strongly", "")

# cbind row names to rest of table
tomz_table_replication <- cbind(tomz_row_names, tomz_table_replication)

# label columns for table, first should be empty
cols <- c(" ", "Empty threat (%)", "Staying out (%)", 
          "Difference in opinion (%)", "Summary of differences (%)")

# label rows, the blank row names are for the credible intervals row which we don't want labeled
rows <- rep("", 14)

# combine all together into matrix for xtable
S3_TabD <- matrix(tomz_table_replication,14,5,dimnames = list(rows,cols))

# use xtable to produce latex code for table (we don't want row names here)
print(xtable(S3_TabD, align = c("l","l","c","c","c","c")), include.rownames = FALSE,
      size="\\fontsize{9pt}{10pt}\\selectfont", file = getOption("S3_TableD.file", "S3_TableD"))

###############################################
## End S3 Appendix; Table D: DLABSS Portion ##
##############################################

###########################
## S4 Appendix; Table A ###
###########################

my.fun <- function(modelname){
  frame <- as.data.frame(model.frame(modelname))
  comb.n <- nrow(frame)
  fep.n <- nrow(frame[which(frame$survey == 1), ])
  sec.n <- nrow(frame[which(frame$survey == 0), ])
  mturk.n <- nrow(frame[which(frame$volunteer == 0), ])
  vol.n <- nrow(frame[which(frame$volunteer == 1), ])
  return(as.vector(c(comb.n, fep.n, sec.n, mturk.n, vol.n )))
}

# these are the regressions used to create figure 2 above

input1 <- c2.reg5
input2 <- c2.reg4
input3 <- c2.reg2
input4 <-  c2.reg3
input5 <- c2.reg1i 
input6 <- effort.out
input7 <- c2.reg11 
input8 <- c2.reg10
input9 <- quality.out
input10 <- c2.reg8
input11 <- c2.reg7
input12 <- c2.reg9
input13 <- c2.reg6
input14 <- c.att.out

samplesizes <- as.tabular(rbind(my.fun(input1), my.fun(input2), my.fun(input3), my.fun(input4), my.fun(input5), 
                                my.fun(input6), my.fun(input7), my.fun(input8), my.fun(input9), my.fun(input10),
                                my.fun(input11), my.fun(input12), my.fun(input13), my.fun(input14)))


#replacing sample size values for time reading prompt + answering short items to reflect FEP survey only
samplesizes[3:4, c(1,3)]  <- NA

#adjusting attention check to reflect sec only
samplesizes[14, 3] <- samplesizes[14, 1 ]
samplesizes[14, c(1,2)]  <- NA


colnames(samplesizes) <- c("Combined", "FEP", "Secular", "MTurk", "Volunteer")
rownames(samplesizes) <- c(  "Time answering open-ended 2", "Time answering open-ended 1", "Time reading prompt", "Time answering short items", "Straightlining", "Open-ended Response Effort", "Open-ended question 2 length", "Open-ended question 1 length" , "Open-ended Response Quality", "Commital question 2",  "Commital question 1", "Consistency", "Skipping", "Attention Check" )

sampx <- as.table(samplesizes)

addtorow <- list()
addtorow$pos <- list(-1)
addtorow$command <- c("& Sample Size &\\multicolumn{2}{c}{By Survey Topic} & \\multicolumn{2}{c}{By Volunteer Status} \\\\\n")

# output latex code for table
print(xtable(sampx, digits=0, align = c("r", "|", "c","|", "c", "c","|","c","c")),  add.to.row =  addtorow, include.colnames = T,  
      include.rownames = T, file = getOption("S4_TableA.file", "S4_TableA"))

###############################
## End S4 Appendix; Table A ###
###############################

###############################
## S4 Appendix; Table B    ####
###############################

# read in data
# data used for response quality analysis regressions is more recent than data used to 
# obtain demographics for the table presented in the online appendix, therefore the N will be
# larger and values slightly different
data <- read.csv("datasets/response_quality_data.csv")

# empty vectors to store results for table
dlabss_qual <- rep(NA, 19)
dlabss_qual_ses <- rep(NA, 19)
dlabss_n <- rep(NA, 18)

mturk_qual <- rep(NA, 19)
mturk_qual_ses <- rep(NA, 19)
mturk_n <- rep(NA, 18)

# DLABSS female
dlabss_qual[1] <- mean(data$female[data$volunteer==1], na.rm = T)*100
dlabss_qual_ses[1] <- SE.prop(data$female[data$volunteer==1])*100
mean(is.na(data$female[data$volunteer==1]))
dlabss_n[1] <- length(na.omit(data$female[data$volunteer==1]))

# Mturk female
mturk_qual[1] <- mean(data$female[data$volunteer==0], na.rm = T)*100
mturk_qual_ses[1] <- SE.prop(data$female[data$volunteer==0])*100
mean(is.na(data$female[data$volunteer==0]))
mturk_n[1] <- length(na.omit(data$female[data$volunteer==0]))

# DLABSS edu.
dlabss_qual[2] <- mean(data$Education[data$volunteer==1], na.rm = T)
dlabss_qual_ses[2] <- SE(data$Education[data$volunteer==1])
mean(is.na(data$Education[data$volunteer==1]))
dlabss_n[2] <- length(na.omit(data$Education[data$volunteer==1]))

# MTurk edu.
mturk_qual[2] <- mean(data$Education[data$volunteer==0], na.rm = T)
mturk_qual_ses[2] <- SE(data$Education[data$volunteer==0])
mean(is.na(data$Education[data$volunteer==0]))
mturk_n[2] <- length(na.omit(data$Education[data$volunteer==0]))

# DLABSS age
dlabss_qual[3] <- mean(data$age.years[data$volunteer==1], na.rm = T)
dlabss_qual_ses[3] <- SE(data$age.years[data$volunteer==1])
mean(is.na(data$age.years[data$volunteer==1]))
dlabss_n[3] <- length(na.omit(data$age.years[data$volunteer==1]))

# MTurk age
mturk_qual[3] <- mean(data$age.years[data$volunteer==0], na.rm = T)
mturk_qual_ses[3] <- SE(data$age.years[data$volunteer==0])
mean(is.na(data$age.years[data$volunteer==0]))
mturk_n[3] <- length(na.omit(data$age.years[data$volunteer==0]))

# DLABSS income
dlabss_qual[4] <- mean(data$income[data$volunteer==1], na.rm = T)
dlabss_qual[5] <- median(data$income[data$volunteer==1], na.rm = T)
dlabss_qual_ses[4] <- SE(data$income[data$volunteer==1])
dlabss_qual_ses[5] <- ""
mean(is.na(data$income[data$volunteer==1]))
dlabss_n[4] <- length(na.omit(data$income[data$volunteer==1]))

# MTurk income
mturk_qual[4] <- mean(data$income[data$volunteer==0], na.rm = T)
mturk_qual[5] <- median(data$income[data$volunteer==0], na.rm = T)
mturk_qual_ses[4] <- SE(data$income[data$volunteer==0])
mturk_qual_ses[5] <- ""
mean(is.na(data$income[data$volunteer==0]))
mturk_n[4] <- length(na.omit(data$income[data$volunteer==0]))

## DLABSS race white
dlabss_qual[6] <- mean(data$Race[data$volunteer==1]=="white",na.rm=T)*100
dlabss_qual_ses[6] <- SE.prop(data$Race[data$volunteer==1]=="white")*100
mean(is.na(data$Race[data$volunteer==1]))
dlabss_n[6] <- length(na.omit(data$Race[data$volunteer==1]))

## MTurk race white
mturk_qual[6] <- mean(data$Race[data$volunteer==0]=="white",na.rm=T)*100
mturk_qual_ses[6] <- SE.prop(data$Race[data$volunteer==0]=="white")*100
mean(is.na(data$Race[data$volunteer==0]))
mturk_n[6] <- length(na.omit(data$Race[data$volunteer==0]))

## DLABSS race black
dlabss_qual[7] <- mean(data$Race[data$volunteer==1]=="black",na.rm=T)*100
dlabss_qual_ses[7] <- SE.prop(data$Race[data$volunteer==1]=="black")*100

## MTurk race black
mturk_qual[7] <- mean(data$Race[data$volunteer==0]=="black",na.rm=T)*100
mturk_qual_ses[7] <- SE.prop(data$Race[data$volunteer==0]=="black")*100


## DLABSS race hispanic
dlabss_qual[8] <- mean(data$Race[data$volunteer==1]=="hispanic",na.rm=T)*100
dlabss_qual_ses[8] <- SE.prop(data$Race[data$volunteer==1]=="hispanic")*100

## MTurk race hispanic
mturk_qual[8] <- mean(data$Race[data$volunteer==0]=="hispanic",na.rm=T)*100
mturk_qual_ses[8] <- SE.prop(data$Race[data$volunteer==0]=="hispanic")*100

# DLABSS religiosity
dlabss_qual[9] <- mean(data$services[data$volunteer==1& data$survey==0], na.rm = T)*100
dlabss_qual_ses[9] <- SE.prop(data$services[data$volunteer==1& data$survey==0])*100
mean(is.na(data$services[data$volunteer==1& data$survey==0]))
dlabss_n[9] <- length(na.omit(data$services[data$volunteer==1]))

# MTurk religiosity
mturk_qual[9] <- mean(data$services[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[9] <- SE.prop(data$services[data$volunteer==0& data$survey==0])*100
mean(is.na(data$services[data$volunteer==0& data$survey==0]))
mturk_n[9] <- length(na.omit(data$services[data$volunteer==0]))

# DLABSS protestant
dlabss_qual[10] <- mean(data$prot[data$volunteer==1 & data$survey==0], na.rm = T)*100
dlabss_qual_ses[10] <- SE.prop(data$prot[data$volunteer==1& data$survey==0])*100
mean(is.na(data$prot[data$volunteer==1& data$survey==0]))
dlabss_n[10] <- length(na.omit(data$prot[data$volunteer==1& data$survey==0]))

# MTurk protestant
mturk_qual[10] <- mean(data$prot[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[10] <- SE.prop(data$prot[data$volunteer==0& data$survey==0])*100
mean(is.na(data$prot[data$volunteer==0& data$survey==0]))

# DLABSS other_chr
dlabss_qual[11] <- mean(data$other_chr[data$volunteer==1 & data$survey==0], na.rm = T)*100
dlabss_qual_ses[11] <- SE.prop(data$other_chr[data$volunteer==1& data$survey==0])*100
mean(is.na(data$other_chr[data$volunteer==1& data$survey==0]))

# MTurk other_chr
mturk_qual[11] <- mean(data$other_chr[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[11] <- SE.prop(data$other_chr[data$volunteer==0& data$survey==0])*100
mean(is.na(data$other_chr[data$volunteer==0& data$survey==0]))

# DLABSS other_non
dlabss_qual[12] <- mean(data$other_non[data$volunteer==1 & data$survey==0], na.rm = T)*100
dlabss_qual_ses[12] <- SE.prop(data$other_non[data$volunteer==1& data$survey==0])*100
mean(is.na(data$other_non[data$volunteer==1& data$survey==0]))

# MTurk other_non
mturk_qual[12] <- mean(data$other_non[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[12] <- SE.prop(data$other_non[data$volunteer==0& data$survey==0])*100
mean(is.na(data$other_non[data$volunteer==0& data$survey==0]))

# DLABSS ath_ag_none
dlabss_qual[13] <- mean(data$ath_ag_none[data$volunteer==1 & data$survey==0], na.rm = T)*100
dlabss_qual_ses[13] <- SE.prop(data$ath_ag_none[data$volunteer==1& data$survey==0])*100
mean(is.na(data$ath_ag_none[data$volunteer==1& data$survey==0]))

# MTurk ath_ag_none
mturk_qual[13] <- mean(data$ath_ag_none[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[13] <- SE.prop(data$ath_ag_none[data$volunteer==0& data$survey==0])*100
mean(is.na(data$ath_ag_none[data$volunteer==0& data$survey==0]))

## Party id
# DLABSS democrat
dlabss_qual[14] <- mean(data$partyID[data$volunteer==1]=='Democrat',na.rm = T)*100
dlabss_qual_ses[14] <- SE.prop(data$partyID[data$volunteer==1]=='Democrat')*100
mean(is.na(data$partyID[data$volunteer==1]))
dlabss_n[14] <- length(na.omit(data$partyID[data$volunteer==1]))

# mturk democrat
mturk_qual[14] <- mean(data$partyID[data$volunteer==0]=='Democrat',na.rm = T)*100
mturk_qual_ses[14] <- SE.prop(data$partyID[data$volunteer==0]=='Democrat')*100
mean(is.na(data$partyID[data$volunteer==0]))
mturk_n[14] <- length(na.omit(data$partyID[data$volunteer==0]))

# DLABSS republican 
dlabss_qual[15] <- mean(data$partyID[data$volunteer==1]=='Republican',na.rm = T)*100
dlabss_qual_ses[15] <- SE.prop(data$partyID[data$volunteer==1]=='Republican')*100

# mturk republican 
mturk_qual[15] <- mean(data$partyID[data$volunteer==0]=='Republican',na.rm = T)*100
mturk_qual_ses[15] <- SE.prop(data$partyID[data$volunteer==0]=='Republican')*100

# DLABSS independent
dlabss_qual[16] <- mean(data$partyID[data$volunteer==1]=='Independent',na.rm = T)*100
dlabss_qual_ses[16] <- SE.prop(data$partyID[data$volunteer==1]=='Independent')*100

# mturk independent
mturk_qual[16] <- mean(data$partyID[data$volunteer==0]=='Independent',na.rm = T)*100
mturk_qual_ses[16] <- SE.prop(data$partyID[data$volunteer==0]=='Independent')*100

# DLABSS ideology
dlabss_qual[17] <- mean(data$liberal[data$volunteer==1& data$survey==0], na.rm = T)*100
dlabss_qual_ses[17] <- SE.prop(data$liberal[data$volunteer==1& data$survey==0])*100
mean(is.na(data$liberal[data$volunteer==1& data$survey==0]))
dlabss_n[17] <- length(na.omit(data$liberal[data$volunteer==1]))

# MTurk ideology
mturk_qual[17] <- mean(data$liberal[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[17] <- SE.prop(data$liberal[data$volunteer==0& data$survey==0])*100
mean(is.na(data$liberal[data$volunteer==0& data$survey==0]))
mturk_n[17] <- length(na.omit(data$liberal[data$volunteer==0]))

# DLABSS english
dlabss_qual[18] <- mean(data$English[data$volunteer==1& data$survey==0], na.rm = T)*100
dlabss_qual_ses[18] <- SE.prop(data$English[data$volunteer==1& data$survey==0])*100
mean(is.na(data$English[data$volunteer==1& data$survey==0]))
length(na.omit(data$English[data$volunteer==1]))

# MTurk english
mturk_qual[18] <- mean(data$English[data$volunteer==0& data$survey==0], na.rm = T)*100
mturk_qual_ses[18] <- SE.prop(data$English[data$volunteer==0& data$survey==0])*100
mean(is.na(data$English[data$volunteer==0& data$survey==0]))
mturk_n[18] <- length(na.omit(data$English[data$volunteer==0]))

# format results for table

dlabss_qual_ses <- round(as.numeric(dlabss_qual_ses),1)
dlabss_qual_ses[5] <- ""

# put means and SEs together nicely for table column
dl_qual_msd <- paste(round(dlabss_qual,1)," (",dlabss_qual_ses,")",sep="")

# add N row
dl_qual_msd[19] <- paste(min(dlabss_n, na.rm = T), "-", max(dlabss_n, na.rm = T), sep = "")

mturk_qual_ses <- round(as.numeric(mturk_qual_ses),1)
mturk_qual_ses[5] <- ""

# put means and SEs together nicely for table column
mt_qual_msd <- paste(round(mturk_qual,1)," (",mturk_qual_ses,")",sep="")

# add N row
mt_qual_msd[19] <- paste(min(mturk_n, na.rm = T), "-", max(mturk_n, na.rm = T), sep = "")


#########################################

# now combine all columns with means and SEs
all_qual_msd <- cbind(dl_qual_msd, mt_qual_msd)

# label columns for table
cols <- c("DLABSS", "MTurk")

# label rows for table
rows <- c("Female", "Education (years)", "Age (years)", "Mean income", "Median income", "White", "Black",
          "Hispanic", "Attend Services Weekly", "Protestant", "Other Christian", "Other", "None",
          "Democrat", "Independent", "Republican", "Liberal", "Speak English at Home", "N")

# combine all together into matrix for xtable
S4_TabB <- matrix(all_qual_msd,19,2,dimnames=list(rows,cols))

# use xtable to produce latex code for table
# latex code is output as a file to the working directory
print(xtable(S4_TabB, align = c("l","c","c")), file = getOption("S4_TableB.file", "S4_TableB"))

####################################
## End S4 Appendix; Table B    ####
###################################

###########################
## S4 Appendix; Figure A ##
###########################

# NOTE: These plots are made using the regressions to create Figure 2 above

#remove invalid time investment coefficients from betas.s
betas.s$std.estimate[3:4] <- NA
betas.s$std.error[3:4] <- NA
betas$std.estimate[3:4] <- NA
betas$std.error[3:4] <- NA

#remove invalid attention check from betas overall and betas.f
betas.f$std.estimate[14] <- NA
betas.f$std.error[14] <- NA
betas$std.estimate[14] <- NA
betas$std.error[14] <- NA

#overall plot with separate surveys:
coef.vec.1 <- betas$std.estimate
coef.vec.2 <- betas.f$std.estimate
coef.vec.3 <- betas.s$std.estimate
names <- betas$names
names.2 <- betas.f$names
names.3 <- betas.s$names
error.vec.1 <- betas$std.error
error.vec.2 <- betas.f$std.error
error.vec.3 <- betas.s$std.error

betas_df2 <- data.frame(term = c( "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items", "straightlining","response effort", "question 2 length", "question 1 length" , "response quality",  "question 2 answer", "question 1 answer", "consistency", "skipping", "attention check",
                                  "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items", "straightlining", "response effort", "question 2 length", "question 1 length" , "response quality", "question 2 answer", "question 1 answer", "consistency", "skipping", "attention check",
                                  "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items",  "straightlining","response effort", "question 2 length", "question 1 length" , "response quality",  "question 2 answer", "question 1 answer", "consistency","skipping", "attention check" ),
                        estimate = c(coef.vec.1, coef.vec.2, coef.vec.3),
                        std.error = c(error.vec.1, error.vec.2, error.vec.3),
                        model =  c( rep("Combined", 14), rep("FEP only", 14), rep("Secularism only", 14)),
                        stringsAsFactors = FALSE)

png(filename = "S4_FigureA.png", width = 900, height = 600)
p2 <- dwplot(betas_df2,
             dot_args = list(size = 3),
             whisker_args = list(lwd=1)
) +
  theme_bw() +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(legend.position="none") +
  scale_colour_grey(start = 0, end = .8) +
  labs(x ="Standardized Coefficient on Volunteer (higher values = higher quality)")+
  #geom_point(size=2, pch=16)+
  theme(legend.position = c(0.8, 0.1),
        legend.justification = c(0, 0), 
        legend.background = element_rect(colour="grey80"),
        legend.title = element_blank()) +
  theme(axis.text = element_text(size = 14),
        legend.text=element_text(size=14),
        legend.title = element_text(size=0),
        axis.title = element_text(size = 14),
        text = element_text(size=14))+
  theme(axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)))

# Add brackets
p2 %>% add_brackets(list(c("Time Investment", "answering open-ended 2", "answering short items"), 
                         c("Open-Ended Questions", "response effort", "response quality" ),
                         c("Committal", "question 2 answer", "question 1 answer")))

dev.off()

###############################
## End S4 Appendix; Figure A ##
###############################

###########################
## S4 Appendix; Figure B ##
###########################

source("source_files/overall_plots_no_controls.R")

#remove invalid time investment coefficients from betas.s
betas.s$std.estimate[3:4] <- NA
betas.s$std.error[3:4] <- NA
betas$std.estimate[3:4] <- NA
betas$std.error[3:4] <- NA

#remove invalid attention check from betas overall and betas.f
betas.f$std.estimate[14] <- NA
betas.f$std.error[14] <- NA
betas$std.estimate[14] <- NA
betas$std.error[14] <- NA

#overall plot with separate surveys:
coef.vec.1 <- betas$std.estimate
coef.vec.2 <- betas.f$std.estimate
coef.vec.3 <- betas.s$std.estimate
names <- betas$names
names.2 <- betas.f$names
names.3 <- betas.s$names
error.vec.1 <- betas$std.error
error.vec.2 <- betas.f$std.error
error.vec.3 <- betas.s$std.error

betas_df2 <- data.frame(term = c( "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items", "straightlining","response effort", "question 2 length", "question 1 length" , "response quality",  "question 2 answer", "question 1 answer", "consistency", "skipping", "attention check",
                                  "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items", "straightlining", "response effort", "question 2 length", "question 1 length" , "response quality", "question 2 answer", "question 1 answer", "consistency", "skipping", "attention check",
                                  "answering open-ended 2",  "answering open-ended 1", "reading prompt", "answering short items",  "straightlining","response effort", "question 2 length", "question 1 length" , "response quality",  "question 2 answer", "question 1 answer", "consistency","skipping", "attention check" ),
                        estimate = c(coef.vec.1, coef.vec.2, coef.vec.3),
                        std.error = c(error.vec.1, error.vec.2, error.vec.3),
                        model =  c( rep("Combined", 14), rep("FEP only", 14), rep("Secularism only", 14)),
                        stringsAsFactors = FALSE)

png(filename = "S4_Figure2.png", width = 900, height = 600)
p2 <- dwplot(betas_df2,
             dot_args = list(size = 3),
             whisker_args = list(lwd=1)
) +
  theme_bw() +
  geom_vline(xintercept = 0, colour = "grey60", linetype = 2) +
  theme(legend.position="none") +
  scale_colour_grey(start = 0, end = .8) +
  labs(x ="Standardized Coefficient on Volunteer (higher values = higher quality)")+
  #geom_point(size=2, pch=16)+
  theme(legend.position = c(0.8, 0.1),
        legend.justification = c(0, 0), 
        legend.background = element_rect(colour="grey80"),
        legend.title = element_blank()) +
  theme(axis.text = element_text(size = 14),
        legend.text=element_text(size=14),
        legend.title = element_text(size=0),
        axis.title = element_text(size = 14),
        text = element_text(size=14))+
  theme(axis.title.x = element_text(margin = margin(t = 20, r = 0, b = 0, l = 0)))

# Add brackets
p2 %>% add_brackets(list(c("Time Investment", "answering open-ended 2", "answering short items"), 
                         c("Open-Ended Questions", "response effort", "response quality" ),
                         c("Committal", "question 2 answer", "question 1 answer")))

dev.off()

###############################
## End S4 Appendix; Figure B ##
###############################

###########################
## S4 Appendix; Figure D ##
###########################

# read in attrition rate data
att_rate <- read.csv("datasets/dlabss_attrition_time.csv")

# get mean and median of time to complete surveys in minutes
vline = data.frame(xint = c(mean(att_rate$survey.time), median(att_rate$survey.time)), 
                   Legend = c("Mean", "Median"))

# plot histogram of time to complete survey in minutes
ggplot(att_rate, aes(x=survey.time)) + geom_histogram(bins=20) + 
  theme(axis.title.x = element_text(size=22),
        axis.title.y = element_text(size=22),
        axis.text = element_text(size = 20)) +
  xlab("Mean Time to Complete (minutes)") + ylab("") +
  geom_hline(yintercept=c(2.5,5,7.5,10,12.5,15),  colour="white",
             linetype="dashed", size=.5) + ylim(0, 16) +
  geom_vline(data = vline,aes(xintercept=xint,linetype = Legend),color = "black",size=1,show.legend = TRUE) + 
  scale_colour_discrete(guide = "none") +
  theme(legend.text = element_text(size = 18), legend.title = element_text(size=18))

# get mean and median for proportion of people who drop out of surveys (attrition rate)
vline = data.frame(xint = c(mean(att_rate$survey.attrition), median(att_rate$survey.attrition)), 
                   Legend = c("Mean", "Median"))

# plot histogram of attrition rates across surveys 
ggplot(att_rate, aes(x=survey.attrition)) + geom_histogram(bins=20) + 
  theme(legend.position='none',
        axis.title.x = element_text(size=22),
        axis.title.y = element_text(size=22),
        axis.text = element_text(size = 20)) +
  xlab("Survey Attrition Rate") + ylab("Frequency") +
  geom_hline(yintercept=c(2.5,5,7.5,10,12.5,15,17.5,20,22.5),  colour="white",
             linetype="dashed", size=.5) + ylim(0, 25) +
  geom_vline(data = vline,aes(xintercept=xint,linetype = Legend),color = "black",size=1,show.legend = TRUE) + 
  scale_colour_discrete(guide = "none") +
  theme(legend.text = element_text(size = 18), legend.title = element_text(size=18))

###############################
## End S4 Appendix; Figure D ##
###############################
